import re
from itertools import chain

import regex  # third-party; needed for \p{C} in remove_control_chars()
from cleantext import clean


class MosesPunctNormalizer:
    """
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
    """

    EXTRA_WHITESPACE = [  # lines 21 - 30
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34

    NORMALIZE_UNICODE = [  # lines 37 - 50
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    FRENCH_QUOTES = [  # lines 52 - 57
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]

    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    # Note: the first element of each pair is a fullwidth/CJK character,
    # mapped to its ASCII equivalent.
    REPLACE_UNICODE_PUNCTUATION = [
        ("，", ","),
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("：", ":"),
        ("？", "?"),
        ("《", '"'),
        ("》", '"'),
        ("）", ")"),
        ("！", "!"),
        ("（", "("),
        ("；", ";"),
        ("」", '"'),
        ("「", '"'),
        ("０", "0"),
        ("１", "1"),
        ("２", "2"),
        ("３", "3"),
        ("４", "4"),
        ("５", "5"),
        ("６", "6"),
        ("７", "7"),
        ("８", "8"),
        ("９", "9"),
        (r"．\s*", ". "),
        ("～", "~"),
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("％", "%"),
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        :param lang: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas.
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers.
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Replace unicode punctuation before normalization.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Remove control characters after normalization.
        :type post_remove_control_chars: bool
        """
        self.substitutions = [
            self.EXTRA_WHITESPACE,
            self.NORMALIZE_UNICODE,
            self.FRENCH_QUOTES,
            self.HANDLE_PSEUDO_SPACES,
        ]

        if penn:
            # Adds the penn substitutions after extra_whitespace regexes.
            self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)

        if norm_quote_commas:
            if lang == "en":
                self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ["de", "es", "fr"]:
                self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            if lang in ["de", "es", "cz", "cs", "fr"]:
                self.substitutions.append(self.DE_ES_CZ_CS_FR)
            else:
                self.substitutions.append(self.OTHER)

        self.substitutions = list(chain(*self.substitutions))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.
""" # Optionally, replace unicode puncts BEFORE normalization. if self.pre_replace_unicode_punct: text = self.replace_unicode_punct(text) # Actual normalization. for regexp, substitution in self.substitutions: # print(regexp, substitution) text = re.sub(regexp, substitution, str(text)) # print(text) # Optionally, replace unicode puncts BEFORE normalization. if self.post_remove_control_chars: text = self.remove_control_chars(text) return text.strip() def replace_unicode_punct(self, text): for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION: text = re.sub(regexp, substitution, str(text)) return text def remove_control_chars(self, text): return regex.sub(r"\p{C}", "", text) def _tokenization_norm(text): text = text.replace( ' ,', ',').replace( ' .', '.').replace( ' ?', '?').replace( ' !', '!').replace( ' ;', ';').replace( ' \'', '\'').replace( ' ’ ', '\'').replace( ' :', ':').replace( '', '\n').replace( '`` ', '"').replace( ' \'\'', '"').replace( '\'\'', '"').replace( '.. ', '... ').replace( ' )', ')').replace( '( ', '(').replace( ' n\'t', 'n\'t').replace( ' i ', ' I ').replace( ' i\'', ' I\'').replace( '\\\'', '\'').replace( '\n ', '\n').strip() return text def _clean_text(text): # remove PLM special tokens plm_special_tokens = r'(\)|(\)|(\<\/s\>)|(\)|(\<\|endoftext\|\>)' text = re.sub(plm_special_tokens, "", text) # normalize puncuations moses_norm = MosesPunctNormalizer() text = moses_norm.normalize(text) # normalize tokenization text = _tokenization_norm(text) # remove specific text patterns, e.g,, url, email and phone number text = clean(text, fix_unicode=True, # fix various unicode errors to_ascii=True, # transliterate to closest ASCII representation lower=False, # lowercase text no_line_breaks=True, # fully strip line breaks as opposed to only normalizing them no_urls=True, # replace all URLs with a special token no_emails=True, # replace all email addresses with a special token no_phone_numbers=True, # replace all phone numbers with a special token no_numbers=False, # replace all numbers with a special token no_digits=False, # replace all digits with a special token no_currency_symbols=False, # replace all currency symbols with a special token no_punct=False, # remove punctuations replace_with_punct="", # instead of removing punctuations you may replace them replace_with_url="", replace_with_email="", replace_with_phone_number="", replace_with_number="", replace_with_digit="", replace_with_currency_symbol="", lang="en" # set to 'de' for German special handling ) # keep common puncts only punct_pattern = r'[^ A-Za-z0-9.?!,:;\-\[\]\{\}\(\)\'\"]' text = re.sub(punct_pattern, '', text) # remove specific patterns spe_pattern = r'[-\[\]\{\}\(\)\'\"]{2,}' text = re.sub(spe_pattern, '', text) # remove redundate spaces text = " ".join(text.split()) return text def _rm_line_break(text): text = text.replace("\n","\\n") text = re.sub(r'(?:\\n)*\\n', r'\\n', text) text = re.sub(r'^.{0,3}\\n', '', text) text = text.replace("\\n"," ") return text def preprocess(text): text = _rm_line_break(text) text = _clean_text(text) return text