musfiqdehan committed on
Commit 407b426
1 Parent(s): d9b70f3

Syncing Hugging Face Space and GitHub

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ .venv/
README.md ADDED
@@ -0,0 +1,24 @@
+ ---
+ title: Bangla PoS Taggers
+ emoji: 🌼
+ colorFrom: red
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.19.2
+ app_file: app.py
+ pinned: true
+ license: mit
+ short_description: Parts of Speech Tagging of Bangla Sentences
+ ---
+
+
+ This demo is based on the paper "[Word Alignment by Fine-tuning Embeddings on Parallel Corpora](https://arxiv.org/abs/2101.08231)":
+ ```
+ @inproceedings{dou2021word,
+     title={Word Alignment by Fine-tuning Embeddings on Parallel Corpora},
+     author={Dou, Zi-Yi and Neubig, Graham},
+     booktitle={Conference of the European Chapter of the Association for Computational Linguistics (EACL)},
+     year={2021}
+ }
+ ```
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,97 @@
+ import gradio as gr
+ from gradio_rich_textbox import RichTextbox
+
+ from helper.text_preprocess import space_punc
+ from helper.pos_taggers import select_pos_tagger
+ from helper.translators import select_translator
+
+
+ def bn_postagger(src, translator, tagger):
+     """
+     Bangla PoS Tagger
+     """
+
+     src = space_punc(src)
+
+     tgt_base, tgt = select_translator(src, translator)
+
+     result, pos_accuracy = select_pos_tagger(src, tgt, tagger)
+
+     return tgt_base, result, pos_accuracy
+
+
+ # Define the Gradio interface
+ # demo = gr.Interface(
+ #     fn=bn_postagger,
+ #     inputs=[
+ #         gr.Textbox(label="Enter Bangla Sentence", placeholder="বাংলা বাক্য লিখুন"),
+ #         gr.Dropdown(["Google", "BanglaNMT", "MyMemory"], label="Select a Translator"),
+ #         gr.Dropdown(["spaCy", "NLTK", "Flair", "TextBlob"], label="Select a PoS Tagger")
+ #     ],
+ #     outputs=[
+ #         gr.Textbox(label="English Translation"),
+ #         RichTextbox(label="PoS Tags"),
+ #         gr.Textbox(label="Overall PoS Tagging Accuracy")
+ #     ],
+ #     live=False,
+ #     title="Bangla PoS Taggers",
+ #     theme='',
+ #     examples=[
+ #         ["বাংলাদেশ দক্ষিণ এশিয়ার একটি সার্বভৌম রাষ্ট্র।"],
+ #         ["বাংলাদেশের সংবিধানিক নাম কি?"],
+ #         ["বাংলাদেশের সাংবিধানিক নাম গণপ্রজাতন্ত্রী বাংলাদেশ।"],
+ #         ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।"],
+ #         ["বিশ্বের আরও একটি সেরা ক্লাব।"]
+ #     ]
+ # )
+
+ with gr.Blocks(css="styles.css") as demo:
+     gr.HTML("<h1>Bangla PoS Taggers</h1>")
+     gr.HTML("<p>Parts of Speech (PoS) Tagging of Bangla Sentences using Bangla-English <strong>Word Alignment</strong></p>")
+
+     with gr.Row():
+         with gr.Column():
+             inputs = [
+                 gr.Textbox(
+                     label="Enter Bangla Sentence",
+                     placeholder="বাংলা বাক্য লিখুন"
+                 ),
+                 gr.Dropdown(
+                     choices=["Google", "BanglaNMT", "MyMemory"],
+                     label="Select a Translator"
+                 ),
+                 gr.Dropdown(
+                     choices=["spaCy", "NLTK", "Flair", "TextBlob"],
+                     label="Select a PoS Tagger"
+                 )
+             ]
+
+             btn = gr.Button(value="Submit", elem_classes="mybtn")
+             gr.ClearButton(inputs)
+
+         with gr.Column():
+             outputs = [
+                 gr.Textbox(label="English Translation"),
+                 RichTextbox(label="PoS Tags"),
+                 gr.Textbox(label="Overall PoS Tagging Accuracy")
+             ]
+
+     btn.click(bn_postagger, inputs, outputs)
+
+     gr.Examples([
+         ["বাংলাদেশ দক্ষিণ এশিয়ার একটি সার্বভৌম রাষ্ট্র।", "Google", "NLTK"],
+         ["বাংলাদেশের সংবিধানিক নাম কি?", "Google", "spaCy"],
+         ["বাংলাদেশের সাংবিধানিক নাম গণপ্রজাতন্ত্রী বাংলাদেশ।", "Google", "TextBlob"],
+         ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।", "Google", "spaCy"],
+         ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।", "BanglaNMT", "spaCy"],
+         ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।", "MyMemory", "spaCy"],
+         ["বিশ্বের আরও একটি সেরা ক্লাব।", "Google", "Flair"]
+     ], inputs)
+
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     demo.launch()
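For a quick smoke test outside the UI, `bn_postagger` can also be called directly. This is a minimal sketch, assuming the requirements are installed and network access is available for the translator and the `musfiqdehan/bn-en-word-aligner` checkpoint; importing `app` builds the Blocks UI but does not launch it, thanks to the `__main__` guard:

```python
from app import bn_postagger

# Translate, align, and tag one of the bundled example sentences.
tgt_base, html_table, accuracy = bn_postagger(
    "বাংলাদেশ দক্ষিণ এশিয়ার একটি সার্বভৌম রাষ্ট্র।", "Google", "NLTK"
)
print(tgt_base)   # raw English translation
print(accuracy)   # e.g. "100.00%" when every Bangla word gets aligned
```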
init.py → helper/__init__.py RENAMED
File without changes
helper/alignment_mappers.py ADDED
@@ -0,0 +1,100 @@
+ """
+ This module contains the helper functions to get the word alignment mapping between two sentences.
+ """
+
+ import torch
+ import itertools
+ import transformers
+ from transformers import logging
+
+ # Set the verbosity to error, so that the warning messages are not printed
+ logging.set_verbosity_warning()
+ logging.set_verbosity_error()
+
+
+ def get_alignment_mapping(source="", target="", model_path="musfiqdehan/bn-en-word-aligner"):
+     """
+     Get Aligned Words
+     """
+     model = transformers.BertModel.from_pretrained(model_path)
+     tokenizer = transformers.BertTokenizer.from_pretrained(model_path)
+
+     # pre-processing
+     sent_src, sent_tgt = source.strip().split(), target.strip().split()
+     token_src, token_tgt = [tokenizer.tokenize(word) for word in sent_src], [
+         tokenizer.tokenize(word) for word in sent_tgt]
+     wid_src, wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_src], [
+         tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
+     ids_src, ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)[
+         'input_ids'], tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', truncation=True, model_max_length=tokenizer.model_max_length)['input_ids']
+     sub2word_map_src = []
+
+     for i, word_list in enumerate(token_src):
+         sub2word_map_src += [i for x in word_list]
+
+     sub2word_map_tgt = []
+
+     for i, word_list in enumerate(token_tgt):
+         sub2word_map_tgt += [i for x in word_list]
+
+     # alignment
+     align_layer = 8
+
+     threshold = 1e-3
+
+     model.eval()
+
+     with torch.no_grad():
+         out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[
+             2][align_layer][0, 1:-1]
+         out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[
+             2][align_layer][0, 1:-1]
+
+         dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
+
+         softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
+         softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
+
+         softmax_inter = (softmax_srctgt > threshold) * \
+             (softmax_tgtsrc > threshold)
+
+     align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
+
+     align_words = set()
+
+     for i, j in align_subwords:
+         align_words.add((sub2word_map_src[i], sub2word_map_tgt[j]))
+
+     return sent_src, sent_tgt, align_words
+
+
+ def get_word_mapping(source="", target="", model_path="musfiqdehan/bn-en-word-aligner"):
+     """
+     Get Word Aligned Mapping Words
+     """
+     sent_src, sent_tgt, align_words = get_alignment_mapping(
+         source=source, target=target, model_path=model_path)
+
+     result = []
+
+     for i, j in sorted(align_words):
+         result.append(f'bn:({sent_src[i]}) -> en:({sent_tgt[j]})')
+
+     return result
+
+
+ def get_word_index_mapping(source="", target="", model_path="musfiqdehan/bn-en-word-aligner"):
+     """
+     Get Word Aligned Mapping Index
+     """
+     sent_src, sent_tgt, align_words = get_alignment_mapping(
+         source=source, target=target, model_path=model_path)
+
+     result = []
+
+     for i, j in sorted(align_words):
+         result.append(f'bn:({i}) -> en:({j})')
+
+     return result
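`get_alignment_mapping` follows the awesome-align recipe from the cited paper: both sentences are encoded by the same fine-tuned BERT, layer-8 hidden states are compared via a dot product, and subword pairs whose source-to-target and target-to-source softmax probabilities both clear the 1e-3 threshold are kept and mapped back to whole-word indices. A minimal usage sketch of the wrapper (the Bangla/English pair is an illustrative example; the checkpoint downloads on first call):

```python
from helper.alignment_mappers import get_word_mapping

# Sentences are whitespace-tokenized, so punctuation should be pre-spaced.
pairs = get_word_mapping(
    source="আমি ভাত খাই ।",
    target="I eat rice .",
)
print(pairs)  # e.g. ['bn:(আমি) -> en:(I)', 'bn:(ভাত) -> en:(rice)', ...]
```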
helper/pos_taggers.py ADDED
@@ -0,0 +1,165 @@
+ """
+ This module contains the functions to get PoS tags using spaCy, NLTK, Flair, and TextBlob and return an HTML table
+ """
+
+ from .alignment_mappers import get_alignment_mapping
+
+ from flair.models import SequenceTagger
+ from flair.data import Sentence
+
+ import spacy
+ from spacy.cli import download
+ download("en_core_web_sm")
+ import en_core_web_sm
+
+ import nltk
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+
+ from textblob import TextBlob
+
+
+ def get_spacy_postag_dict(target=""):
+     '''
+     Get spaCy PoS tags
+     '''
+     nlp = en_core_web_sm.load()
+     target_tokenized = nlp(target)
+     spacy_postag_dict = dict((token.text, token.tag_)
+                              for token in target_tokenized)
+     return spacy_postag_dict
+
+
+ def get_nltk_postag_dict(target=""):
+     '''
+     Get NLTK PoS tags
+     '''
+     target_tokenized = nltk.tokenize.word_tokenize(target)
+     nltk_postag_dict = dict((key, value)
+                             for key, value in nltk.pos_tag(target_tokenized))
+     return nltk_postag_dict
+
+
+ def get_flair_postag_dict(target=""):
+     '''
+     Get Flair PoS tags
+     '''
+     tagger = SequenceTagger.load("pos")
+     target_tokenized = Sentence(target)
+     tagger.predict(target_tokenized)
+     flair_postag_dict = dict((token.text, token.tag)
+                              for token in target_tokenized)
+     return flair_postag_dict
+
+
+ def get_textblob_postag_dict(target=""):
+     '''
+     Get TextBlob PoS tags
+     '''
+     blob = TextBlob(target)
+     textblob_postag_dict = dict(blob.tags)
+     return textblob_postag_dict
+
+
+ def get_postag(
+         get_postag_dict,
+         source="",
+         target="",
+         model_path="musfiqdehan/bn-en-word-aligner"):
+     """Get PoS tags from the selected tagger and return an HTML table"""
+
+     sent_src, sent_tgt, align_words = get_alignment_mapping(
+         source=source, target=target, model_path=model_path
+     )
+     postag_dict = get_postag_dict(target=target)
+
+     mapped_sent_src = []
+
+     html_table = '''
+     <table>
+         <thead>
+             <th>Bangla</th>
+             <th>English</th>
+             <th>PoS Tags</th>
+         </thead>
+         <tbody>
+     '''
+
+     for i, j in sorted(align_words):
+         punc = r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~"""
+         if sent_src[i] in punc or sent_tgt[j] in punc:
+             mapped_sent_src.append(sent_src[i])
+
+             html_table += f'''
+             <tr>
+                 <td> {sent_src[i]} </td>
+                 <td> {sent_tgt[j]} </td>
+                 <td> PUNC </td>
+             </tr>
+             '''
+         else:
+             mapped_sent_src.append(sent_src[i])
+
+             html_table += f'''
+             <tr>
+                 <td> {sent_src[i]} </td>
+                 <td> {sent_tgt[j]} </td>
+                 <td> {postag_dict[sent_tgt[j]]} </td>
+             </tr>
+             '''
+
+     unks = list(set(sent_src).difference(set(mapped_sent_src)))
+     for word in unks:
+         html_table += f'''
+             <tr>
+                 <td> {word} </td>
+                 <td> N/A </td>
+                 <td> UNK </td>
+             </tr>
+         '''
+
+     html_table += '''
+         </tbody>
+     </table>
+     '''
+
+     pos_accuracy = (len(sent_src) - len(unks)) / len(sent_src)
+     pos_accuracy = f"{pos_accuracy:0.2%}"
+
+     return html_table, pos_accuracy
+
+
+ def select_pos_tagger(src, tgt, tagger):
+     '''
+     Select the PoS tagger
+     '''
+
+     result = None
+     pos_accuracy = None
+
+     if tagger == "spaCy":
+         result, pos_accuracy = get_postag(
+             get_spacy_postag_dict,
+             source=src,
+             target=tgt,
+             model_path="musfiqdehan/bn-en-word-aligner",
+         )
+     elif tagger == "NLTK":
+         result, pos_accuracy = get_postag(
+             get_nltk_postag_dict,
+             source=src,
+             target=tgt,
+             model_path="musfiqdehan/bn-en-word-aligner",
+         )
+     elif tagger == "Flair":
+         result, pos_accuracy = get_postag(
+             get_flair_postag_dict,
+             source=src,
+             target=tgt,
+             model_path="musfiqdehan/bn-en-word-aligner",
+         )
+     elif tagger == "TextBlob":
+         result, pos_accuracy = get_postag(
+             get_textblob_postag_dict,
+             source=src,
+             target=tgt,
+             model_path="musfiqdehan/bn-en-word-aligner",
+         )
+     return result, pos_accuracy
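The table-building logic transfers English PoS tags onto Bangla words through the alignment: aligned pairs become rows, punctuation is labeled PUNC, unaligned Bangla words are reported as UNK, and the accuracy figure is simply the fraction of source words that received a tag. A hedged usage sketch (the sentence pair is illustrative; tagger models are downloaded at import time, so the first run is slow):

```python
from helper.pos_taggers import select_pos_tagger

# src: punctuation-spaced Bangla sentence; tgt: its English translation
html_table, accuracy = select_pos_tagger(
    src="আমি ভাত খাই ।",
    tgt="I eat rice .",
    tagger="NLTK",
)
print(accuracy)  # e.g. "100.00%" if every Bangla word is aligned
```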
helper/text_preprocess.py ADDED
@@ -0,0 +1,165 @@
+ """
+ This file contains functions for text preprocessing
+ """
+
+ import re
+
+
+ def decontracting_words(sentence):
+     """
+     Decontracting words (e.g. I'm -> I am, I've -> I have, etc.)
+     https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
+     https://stackoverflow.com/a/19794953
+     """
+     contractions = {
+         "ain't": "am not",
+         "aren't": "are not",
+         "can't": "can not",
+         "can't've": "can not have",
+         "'cause": "because",
+         "could've": "could have",
+         "couldn't": "could not",
+         "couldn't've": "could not have",
+         "didn't": "did not",
+         "doesn't": "does not",
+         "don't": "do not",
+         "hadn't": "had not",
+         "hadn't've": "had not have",
+         "hasn't": "has not",
+         "haven't": "have not",
+         "he'd": "he would",
+         "he'd've": "he would have",
+         "he'll": "he will",
+         "he'll've": "he will have",
+         "he's": "he is",
+         "how'd": "how did",
+         "how'd'y": "how do you",
+         "how'll": "how will",
+         "how's": "how is",
+         "i'd": "i would",
+         "i'd've": "i would have",
+         "i'll": "i will",
+         "i'll've": "i will have",
+         "i'm": "i am",
+         "i've": "i have",
+         "isn't": "is not",
+         "it'd": "it would",
+         "it'd've": "it would have",
+         "it'll": "it will",
+         "it'll've": "it will have",
+         "it's": "it is",
+         "let's": "let us",
+         "ma'am": "madam",
+         "mayn't": "may not",
+         "might've": "might have",
+         "mightn't": "might not",
+         "mightn't've": "might not have",
+         "must've": "must have",
+         "mustn't": "must not",
+         "mustn't've": "must not have",
+         "needn't": "need not",
+         "needn't've": "need not have",
+         "o'clock": "of the clock",
+         "oughtn't": "ought not",
+         "oughtn't've": "ought not have",
+         "shan't": "shall not",
+         "sha'n't": "shall not",
+         "shan't've": "shall not have",
+         "she'd": "she would",
+         "she'd've": "she would have",
+         "she'll": "she will",
+         "she'll've": "she will have",
+         "she's": "she is",
+         "should've": "should have",
+         "shouldn't": "should not",
+         "shouldn't've": "should not have",
+         "so've": "so have",
+         "so's": "so as",
+         "that'd": "that would",
+         "that'd've": "that would have",
+         "that's": "that is",
+         "there'd": "there would",
+         "there'd've": "there would have",
+         "there's": "there is",
+         "they'd": "they would",
+         "they'd've": "they would have",
+         "they'll": "they will",
+         "they'll've": "they will have",
+         "they're": "they are",
+         "they've": "they have",
+         "to've": "to have",
+         "wasn't": "was not",
+         "we'd": "we would",
+         "we'd've": "we would have",
+         "we'll": "we will",
+         "we'll've": "we will have",
+         "we're": "we are",
+         "we've": "we have",
+         "weren't": "were not",
+         "what'll": "what will",
+         "what'll've": "what will have",
+         "what're": "what are",
+         "what's": "what is",
+         "what've": "what have",
+         "when's": "when is",
+         "when've": "when have",
+         "where'd": "where did",
+         "where's": "where is",
+         "where've": "where have",
+         "who'll": "who will",
+         "who'll've": "who will have",
+         "who's": "who is",
+         "who've": "who have",
+         "why's": "why is",
+         "why've": "why have",
+         "will've": "will have",
+         "won't": "will not",
+         "won't've": "will not have",
+         "would've": "would have",
+         "wouldn't": "would not",
+         "wouldn't've": "would not have",
+         "y'all": "you all",
+         "y'all'd": "you all would",
+         "y'all'd've": "you all would have",
+         "y'all're": "you all are",
+         "y'all've": "you all have",
+         "you'd": "you would",
+         "you'd've": "you would have",
+         "you'll": "you will",
+         "you'll've": "you will have",
+         "you're": "you are",
+         "you've": "you have"
+     }
+
+     sentence_decontracted = []
+
+     for word in sentence.split():
+         if word in contractions:
+             word = contractions[word]
+
+         sentence_decontracted.append(word)
+
+     sentence = ' '.join(sentence_decontracted)
+     sentence = sentence.replace("'ve", " have")
+     sentence = sentence.replace("n't", " not")
+     sentence = sentence.replace("'re", " are")
+     sentence = sentence.replace("'ll", " will")
+     sentence = sentence.replace("'d", " would")
+     sentence = sentence.replace("'s", " is")
+     sentence = sentence.replace("'m", " am")
+
+     return sentence
+
+
+ def space_punc(line):
+     """
+     Add a space before and after a punctuation mark
+     and remove more than one space
+     print(space_punc('bla. bla? "bla"bla.bla! bla...'))
+     >> bla . bla ? " bla " bla . bla ! bla . . .
+     """
+
+     line = re.sub(r'([.,:;\-।!?"()\'])', r" \1 ", line)
+     line = re.sub(r"\s{2,}", " ", line)
+     return line
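A quick check of both helpers; the expected outputs in the comments follow directly from the substitution rules above (the `space_punc` result mirrors its own docstring):

```python
from helper.text_preprocess import decontracting_words, space_punc

print(decontracting_words("I can't go, you'll see"))
# I can not go, you will see

print(space_punc('bla. bla? "bla"bla.bla! bla...'))
# bla . bla ? " bla " bla . bla ! bla . . .
```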
helper/translators.py ADDED
@@ -0,0 +1,141 @@
+ """
+ This file contains the functions to translate text from one language to another.
+ """
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from deep_translator import GoogleTranslator, MyMemoryTranslator, MicrosoftTranslator, YandexTranslator, ChatGptTranslator
+ from .text_preprocess import decontracting_words, space_punc
+ from dotenv import load_dotenv
+ import os
+
+
+ # Load the environment variables from the .env file
+ load_dotenv()
+
+ # Translator API keys
+ MICROSOFT_API_KEY = os.getenv("MICROSOFT_TRANSLATOR_KEY")
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ YANDEX_API_KEY = os.getenv("YANDEX_API_KEY")
+
+ # Digit translation
+ digit_converter = {
+     '০': '0',
+     '১': '1',
+     '২': '2',
+     '৩': '3',
+     '৪': '4',
+     '৫': '5',
+     '৬': '6',
+     '৭': '7',
+     '৮': '8',
+     '৯': '9'
+ }
+
+
+ def get_translated_digit(sentence):
+     """
+     Translate the digits from Bengali to English
+     """
+     translated_sentence = []
+     for each_letter in sentence:
+         if each_letter in digit_converter:
+             translated_sentence.append(digit_converter[each_letter])
+         else:
+             translated_sentence.append(each_letter)
+
+     return "".join(translated_sentence)
+
+
+ # Bangla to English Translation (BUET BanglaNMT)
+ translation_model_bn_en = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
+ translation_tokenizer_bn_en = AutoTokenizer.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
+
+
+ def banglanmt_translation(input_text):
+     """
+     Translate a sentence from Bengali to English using BUET BanglaNMT
+     """
+     inputs = translation_tokenizer_bn_en(input_text, return_tensors="pt")
+     outputs = translation_model_bn_en.generate(**inputs)
+     translated_text = translation_tokenizer_bn_en.decode(outputs[0], skip_special_tokens=True)
+     return translated_text
+
+
+ def google_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using Google Translator.\n
+     First install the dependency:\n
+     `pip install -U deep-translator`
+     """
+     translator = GoogleTranslator(source=source, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def microsoft_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using Microsoft Translator.\n
+     First install the dependency:\n
+     `pip install -U deep-translator`
+     """
+     translator = MicrosoftTranslator(
+         api_key=MICROSOFT_API_KEY, source=source, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def chatgpt_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using ChatGPT Translator.\n
+     First install the dependency:\n
+     `pip install -U deep-translator`
+     """
+     translator = ChatGptTranslator(api_key=OPENAI_API_KEY, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def yandex_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using Yandex Translator.\n
+     First install the dependency:\n
+     `pip install -U deep-translator`
+     """
+     translator = YandexTranslator(api_key=YANDEX_API_KEY)
+     translated_sentence = translator.translate(
+         sentence, source=source, target=target)
+     return translated_sentence
+
+
+ def mymemory_translation(sentence: str, source="bn-IN", target="en-US") -> str:
+     """
+     Translate a sentence from one language to another using MyMemory Translator.\n
+     First install the dependency:\n
+     `pip install -U deep-translator`
+     """
+     translator = MyMemoryTranslator(source=source, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def get_better_translation(translator_func, src=""):
+     src_mod = get_translated_digit(src)
+     tgt = translator_func(src_mod)
+     tgt = decontracting_words(tgt)
+     tgt = tgt.replace('rupees', 'takas').replace('Rs', 'takas')
+     return tgt
+
+
+ def select_translator(src, translator):
+     """
+     Select the translator
+     """
+     tgt = None
+     tgt_base = None
+
+     if translator == "Google":
+         tgt = get_better_translation(google_translation, src)
+         tgt = space_punc(tgt)
+         tgt_base = google_translation(src)
+     elif translator == "BanglaNMT":
+         tgt = get_better_translation(banglanmt_translation, src)
+         tgt = space_punc(tgt)
+         tgt_base = banglanmt_translation(src)
+     elif translator == "MyMemory":
+         tgt = get_better_translation(mymemory_translation, src)
+         tgt = space_punc(tgt)
+         tgt_base = mymemory_translation(src)
+
+     return tgt_base, tgt
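`select_translator` returns two strings: `tgt_base`, the raw translation shown to the user, and `tgt`, a post-processed variant (Bengali digits transliterated, contractions expanded, currency wording normalized, punctuation spaced) that feeds the aligner. A minimal sketch, assuming network access for the Google backend and noting that the BanglaNMT weights download at import:

```python
from helper.translators import select_translator

tgt_base, tgt = select_translator("আমি ভাত খাই।", "Google")
print(tgt_base)  # raw translation, e.g. "I eat rice."
print(tgt)       # aligner-friendly form, e.g. "I eat rice . "
```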
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch
+ sentencepiece
+ transformers
+ spacy
+ flair
+ nltk
+ textblob
+ deep-translator
+ pandas
+ gradio_rich_textbox
+ python-dotenv
styles.css ADDED
@@ -0,0 +1,29 @@
+ @import url("https://fonts.googleapis.com/css2?family=Merriweather:wght@400;700;900&display=swap");
+
+ h1 {
+     font-family: "Merriweather", serif;
+     text-align: center;
+     font-weight: 700;
+ }
+
+ p {
+     text-align: center;
+ }
+
+ .mybtn {
+     background-color: rgb(240, 98, 16) !important;
+ }
+
+ table {
+     border: 1px solid gray;
+     border-collapse: collapse;
+     text-align: center;
+     width: 100%;
+ }
+
+ th,
+ td {
+     border: 1px solid gray;
+     border-collapse: collapse;
+     padding: 5px;
+ }