yaful committed on
Commit
f507f90
1 Parent(s): 568da45

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +279 -0
utils.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import unicodedata
from itertools import chain

import torch
from cleantext import clean
5
+
6
class MosesPunctNormalizer:
    """
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
    """

    # Whitespace tidying around punctuation (perl lines 21 - 30).
    EXTRA_WHITESPACE = [
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    # Applied only when ``penn=True`` (perl lines 33 - 34).
    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]

    # Canonicalize typographic quotes, dashes and ellipsis (perl lines 37 - 50).
    NORMALIZE_UNICODE = [
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    # Guillemets, with optional surrounding no-break spaces (perl lines 52 - 57).
    FRENCH_QUOTES = [
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    # No-break spaces glued to punctuation/units (perl lines 59 - 67).
    HANDLE_PSEUDO_SPACES = [
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    # English convention: comma/period inside the closing quote.
    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    # German/Spanish/French convention: comma/period outside the quote.
    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]

    # Decimal separator: comma for de/es/cz/cs/fr ...
    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    # ... and period for everything else.
    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    # NOTE: the left-hand sides are the CJK/fullwidth code points, as in the
    # perl original.  An earlier revision had them collapsed to plain ASCII,
    # which made most entries no-ops and — fatally — turned the fullwidth
    # full stop entry into the pattern r".\s*", which matches EVERY character
    # and would replace the whole input with ". " sequences.
    REPLACE_UNICODE_PUNCTUATION = [
        ("，", ","),
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("：", ":"),
        ("？", "?"),
        ("《", '"'),
        ("》", '"'),
        ("）", ")"),
        ("！", "!"),
        ("（", "("),
        ("；", ";"),
        ("」", '"'),
        ("「", '"'),
        ("０", "0"),
        ("１", "1"),
        ("２", "2"),
        ("３", "3"),
        ("４", "4"),
        ("５", "5"),
        ("６", "6"),
        ("７", "7"),
        ("８", "8"),
        ("９", "9"),
        (r"．\s*", ". "),
        ("～", "~"),
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("％", "%"),
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        :param lang: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas.
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers.
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Apply REPLACE_UNICODE_PUNCTUATION
            before the main normalization pass.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Strip Unicode control characters
            after the main normalization pass.
        :type post_remove_control_chars: bool
        """
        self.substitutions = [
            self.EXTRA_WHITESPACE,
            self.NORMALIZE_UNICODE,
            self.FRENCH_QUOTES,
            self.HANDLE_PSEUDO_SPACES,
        ]

        if penn:  # Adds the penn substitutions after extra_whitespace regexes.
            self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)

        if norm_quote_commas:
            if lang == "en":
                self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ["de", "es", "fr"]:
                self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            if lang in ["de", "es", "cz", "cs", "fr"]:
                self.substitutions.append(self.DE_ES_CZ_CS_FR)
            else:
                self.substitutions.append(self.OTHER)

        # Flatten the selected groups into one ordered (pattern, repl) list.
        self.substitutions = list(chain(*self.substitutions))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.
        """
        # Optionally, replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Actual normalization: apply every substitution in order.
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, str(text))

        # Optionally, remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)

        return text.strip()

    def replace_unicode_punct(self, text):
        """Map CJK/fullwidth punctuation to its ASCII equivalent."""
        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
            text = re.sub(regexp, substitution, str(text))
        return text

    def remove_control_chars(self, text):
        r"""Remove all Unicode control/format characters (category ``C*``).

        The upstream implementation uses the third-party ``regex`` module
        (``regex.sub(r"\p{C}", "", text)``), which this file never imports —
        calling it raised ``NameError`` — and stdlib ``re`` has no ``\p{...}``
        support.  Filtering on ``unicodedata.category`` is the equivalent.
        """
        return "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
196
+
197
+ def _tokenization_norm(text):
198
+ text = text.replace(
199
+ ' ,', ',').replace(
200
+ ' .', '.').replace(
201
+ ' ?', '?').replace(
202
+ ' !', '!').replace(
203
+ ' ;', ';').replace(
204
+ ' \'', '\'').replace(
205
+ ' ’ ', '\'').replace(
206
+ ' :', ':').replace(
207
+ '<newline>', '\n').replace(
208
+ '`` ', '"').replace(
209
+ ' \'\'', '"').replace(
210
+ '\'\'', '"').replace(
211
+ '.. ', '... ').replace(
212
+ ' )', ')').replace(
213
+ '( ', '(').replace(
214
+ ' n\'t', 'n\'t').replace(
215
+ ' i ', ' I ').replace(
216
+ ' i\'', ' I\'').replace(
217
+ '\\\'', '\'').replace(
218
+ '\n ', '\n').strip()
219
+ return text
220
+
221
+
222
def _clean_text(text):
    """Run the full text-cleaning pipeline on one document.

    Steps: strip PLM special tokens, Moses punctuation normalization,
    tokenization cleanup, cleantext scrubbing (URLs/emails/phone numbers),
    then a punctuation whitelist and whitespace collapse.
    """
    # Remove PLM special tokens: <pad>, <s>, </s>, <unk>, <|endoftext|>.
    text = re.sub(
        r'(\<pad\>)|(\<s\>)|(\<\/s\>)|(\<unk\>)|(\<\|endoftext\|\>)', "", text
    )

    # Normalize punctuation (Moses-style), then tokenization artifacts.
    text = MosesPunctNormalizer().normalize(text)
    text = _tokenization_norm(text)

    # Remove specific text patterns, e.g., url, email and phone number.
    text = clean(
        text,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=False,                    # lowercase text
        no_line_breaks=True,            # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                   # replace all URLs with a special token
        no_emails=True,                 # replace all email addresses with a special token
        no_phone_numbers=True,          # replace all phone numbers with a special token
        no_numbers=False,               # replace all numbers with a special token
        no_digits=False,                # replace all digits with a special token
        no_currency_symbols=False,      # replace all currency symbols with a special token
        no_punct=False,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="<NUMBER>",
        replace_with_digit="<DIGIT>",
        replace_with_currency_symbol="<CUR>",
        lang="en",                      # set to 'de' for German special handling
    )

    # Keep common puncts only.
    text = re.sub(r'[^ A-Za-z0-9.?!,:;\-\[\]\{\}\(\)\'\"]', '', text)
    # Remove runs (2+) of brackets/quotes/dashes.
    text = re.sub(r'[-\[\]\{\}\(\)\'\"]{2,}', '', text)
    # Collapse redundant whitespace.
    return " ".join(text.split())
266
+
267
+ def _rm_line_break(text):
268
+ text = text.replace("\n","\\n")
269
+ text = re.sub(r'(?:\\n)*\\n', r'\\n', text)
270
+ text = re.sub(r'^.{0,3}\\n', '', text)
271
+ text = text.replace("\\n"," ")
272
+ return text
273
+
274
def preprocess(text):
    """Full preprocessing pipeline: flatten line breaks, then clean the text."""
    return _clean_text(_rm_line_break(text))
278
+
279
+