# Text preprocessing utilities: Moses-style punctuation normalization and
# text cleaning. (Non-code page-header lines from the original source page
# were removed here.)
import re
import unicodedata
from itertools import chain

import torch
from cleantext import clean
class MosesPunctNormalizer: | |
""" | |
This is a Python port of the Moses punctuation normalizer from | |
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl | |
""" | |
EXTRA_WHITESPACE = [ # lines 21 - 30 | |
(r"\r", r""), | |
(r"\(", r" ("), | |
(r"\)", r") "), | |
(r" +", r" "), | |
(r"\) ([.!:?;,])", r")\g<1>"), | |
(r"\( ", r"("), | |
(r" \)", r")"), | |
(r"(\d) %", r"\g<1>%"), | |
(r" :", r":"), | |
(r" ;", r";"), | |
] | |
NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')] # lines 33 - 34 | |
NORMALIZE_UNICODE = [ # lines 37 - 50 | |
("„", r'"'), | |
("“", r'"'), | |
("”", r'"'), | |
("–", r"-"), | |
("—", r" - "), | |
(r" +", r" "), | |
("´", r"'"), | |
("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"), | |
("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"), | |
("‘", r"'"), | |
("‚", r"'"), | |
("’", r"'"), | |
(r"''", r'"'), | |
("´´", r'"'), | |
("…", r"..."), | |
] | |
FRENCH_QUOTES = [ # lines 52 - 57 | |
("\u00A0«\u00A0", r'"'), | |
("«\u00A0", r'"'), | |
("«", r'"'), | |
("\u00A0»\u00A0", r'"'), | |
("\u00A0»", r'"'), | |
("»", r'"'), | |
] | |
HANDLE_PSEUDO_SPACES = [ # lines 59 - 67 | |
("\u00A0%", r"%"), | |
("nº\u00A0", "nº "), | |
("\u00A0:", r":"), | |
("\u00A0ºC", " ºC"), | |
("\u00A0cm", r" cm"), | |
("\u00A0\\?", "?"), | |
("\u00A0\\!", "!"), | |
("\u00A0;", r";"), | |
(",\u00A0", r", "), | |
(r" +", r" "), | |
] | |
EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')] | |
DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [ | |
(r',"', r'",'), | |
(r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'), # don't fix period at end of sentence | |
] | |
DE_ES_CZ_CS_FR = [ | |
("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"), | |
] | |
OTHER = [ | |
("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"), | |
] | |
# Regex substitutions from replace-unicode-punctuation.perl | |
# https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl | |
REPLACE_UNICODE_PUNCTUATION = [ | |
(",", ","), | |
(r"。\s*", ". "), | |
("、", ","), | |
("”", '"'), | |
("“", '"'), | |
("∶", ":"), | |
(":", ":"), | |
("?", "?"), | |
("《", '"'), | |
("》", '"'), | |
(")", ")"), | |
("!", "!"), | |
("(", "("), | |
(";", ";"), | |
("」", '"'), | |
("「", '"'), | |
("0", "0"), | |
("1", "1"), | |
("2", "2"), | |
("3", "3"), | |
("4", "4"), | |
("5", "5"), | |
("6", "6"), | |
("7", "7"), | |
("8", "8"), | |
("9", "9"), | |
(r".\s*", ". "), | |
("~", "~"), | |
("’", "'"), | |
("…", "..."), | |
("━", "-"), | |
("〈", "<"), | |
("〉", ">"), | |
("【", "["), | |
("】", "]"), | |
("%", "%"), | |
] | |
def __init__( | |
self, | |
lang="en", | |
penn=True, | |
norm_quote_commas=True, | |
norm_numbers=True, | |
pre_replace_unicode_punct=False, | |
post_remove_control_chars=False, | |
): | |
""" | |
:param language: The two-letter language code. | |
:type lang: str | |
:param penn: Normalize Penn Treebank style quotations. | |
:type penn: bool | |
:param norm_quote_commas: Normalize quotations and commas | |
:type norm_quote_commas: bool | |
:param norm_numbers: Normalize numbers | |
:type norm_numbers: bool | |
""" | |
self.substitutions = [ | |
self.EXTRA_WHITESPACE, | |
self.NORMALIZE_UNICODE, | |
self.FRENCH_QUOTES, | |
self.HANDLE_PSEUDO_SPACES, | |
] | |
if penn: # Adds the penn substitutions after extra_whitespace regexes. | |
self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN) | |
if norm_quote_commas: | |
if lang == "en": | |
self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA) | |
elif lang in ["de", "es", "fr"]: | |
self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA) | |
if norm_numbers: | |
if lang in ["de", "es", "cz", "cs", "fr"]: | |
self.substitutions.append(self.DE_ES_CZ_CS_FR) | |
else: | |
self.substitutions.append(self.OTHER) | |
self.substitutions = list(chain(*self.substitutions)) | |
self.pre_replace_unicode_punct = pre_replace_unicode_punct | |
self.post_remove_control_chars = post_remove_control_chars | |
def normalize(self, text): | |
""" | |
Returns a string with normalized punctuation. | |
""" | |
# Optionally, replace unicode puncts BEFORE normalization. | |
if self.pre_replace_unicode_punct: | |
text = self.replace_unicode_punct(text) | |
# Actual normalization. | |
for regexp, substitution in self.substitutions: | |
# print(regexp, substitution) | |
text = re.sub(regexp, substitution, str(text)) | |
# print(text) | |
# Optionally, replace unicode puncts BEFORE normalization. | |
if self.post_remove_control_chars: | |
text = self.remove_control_chars(text) | |
return text.strip() | |
def replace_unicode_punct(self, text): | |
for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION: | |
text = re.sub(regexp, substitution, str(text)) | |
return text | |
def remove_control_chars(self, text): | |
return regex.sub(r"\p{C}", "", text) | |
def _tokenization_norm(text): | |
text = text.replace( | |
' ,', ',').replace( | |
' .', '.').replace( | |
' ?', '?').replace( | |
' !', '!').replace( | |
' ;', ';').replace( | |
' \'', '\'').replace( | |
' ’ ', '\'').replace( | |
' :', ':').replace( | |
'<newline>', '\n').replace( | |
'`` ', '"').replace( | |
' \'\'', '"').replace( | |
'\'\'', '"').replace( | |
'.. ', '... ').replace( | |
' )', ')').replace( | |
'( ', '(').replace( | |
' n\'t', 'n\'t').replace( | |
' i ', ' I ').replace( | |
' i\'', ' I\'').replace( | |
'\\\'', '\'').replace( | |
'\n ', '\n').strip() | |
return text | |
def _clean_text(text):
    """Normalize and sanitize raw text into plain ASCII with common punctuation.

    Pipeline: strip PLM special tokens -> Moses punctuation normalization ->
    detokenization spacing fixes -> cleantext scrub (URLs/emails/phones) ->
    punctuation whitelist -> whitespace collapse.
    """
    # Drop special tokens emitted by pretrained LMs: <pad>, <s>, </s>, <unk>,
    # <|endoftext|>.
    text = re.sub(
        r'(\<pad\>)|(\<s\>)|(\<\/s\>)|(\<unk\>)|(\<\|endoftext\|\>)', "", text
    )

    # Moses-style punctuation normalization.
    text = MosesPunctNormalizer().normalize(text)

    # Undo tokenizer spacing artifacts.
    text = _tokenization_norm(text)

    # Scrub specific text patterns (URLs, e-mail addresses, phone numbers)
    # via cleantext, transliterating to ASCII along the way.
    text = clean(
        text,
        fix_unicode=True,              # fix various unicode errors
        to_ascii=True,                 # transliterate to closest ASCII representation
        lower=False,                   # keep original casing
        no_line_breaks=True,           # fully strip line breaks
        no_urls=True,                  # replace all URLs
        no_emails=True,                # replace all email addresses
        no_phone_numbers=True,         # replace all phone numbers
        no_numbers=False,              # keep numbers
        no_digits=False,               # keep digits
        no_currency_symbols=False,     # keep currency symbols
        no_punct=False,                # keep punctuation
        replace_with_punct="",
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="<NUMBER>",
        replace_with_digit="<DIGIT>",
        replace_with_currency_symbol="<CUR>",
        lang="en",                     # set to 'de' for German special handling
    )

    # Keep only alphanumerics, spaces and common punctuation.
    text = re.sub(r'[^ A-Za-z0-9.?!,:;\-\[\]\{\}\(\)\'\"]', '', text)
    # Remove runs of brackets/quotes/dashes left behind by the filter above.
    text = re.sub(r'[-\[\]\{\}\(\)\'\"]{2,}', '', text)
    # Collapse any whitespace runs into single spaces.
    return " ".join(text.split())
def _rm_line_break(text): | |
text = text.replace("\n","\\n") | |
text = re.sub(r'(?:\\n)*\\n', r'\\n', text) | |
text = re.sub(r'^.{0,3}\\n', '', text) | |
text = text.replace("\\n"," ") | |
return text | |
def preprocess(text):
    """Run the full preprocessing pipeline: flatten line breaks, then clean."""
    return _clean_text(_rm_line_break(text))