# parser/udpipe2/udpipe2_dataset.py
# This file is part of UDPipe 2 <http://github.com/ufal/udpipe>.
#
# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import io
import pickle
import re
import sys
import numpy as np
__version__ = "2.1.1-dev"
class UDPipe2Dataset:
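    """Dataset of CoNLL-U sentences for UDPipe 2 training and prediction.

    Loads sentences from a CoNLL-U file (or string), builds word-, character-
    and lemma-rule vocabularies, and serves (optionally shuffled) batches.

    A minimal usage sketch (file names are illustrative only):

        train = UDPipe2Dataset(path="train.conllu")
        dev = UDPipe2Dataset(path="dev.conllu", train=train, shuffle_batches=False)
        while not train.epoch_finished():
            batch = train.next_batch(batch_size=32)
    """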
FORMS = 0
LEMMAS = 1
UPOS = 2
XPOS = 3
FEATS = 4
HEAD = 5
DEPREL = 6
DEPS = 7
MISC = 8
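    # FACTORS is the number of CoNLL-U factor columns above; VARIANT and
    # EMBEDDINGS index the extra entries appended to the word-level batch
    # data by next_batch.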
FACTORS = 9
VARIANT = 9
EMBEDDINGS = 10
FACTORS_MAP = {"FORMS": FORMS, "LEMMAS": LEMMAS, "UPOS": UPOS, "XPOS": XPOS, "FEATS": FEATS,
"HEAD": HEAD, "DEPREL": DEPREL, "DEPS": DEPS, "MISC": MISC}
re_extras = re.compile(r"^#|^\d+-|^\d+\.")
re_variant = re.compile(r"^#\s*variant\s*=\s*(\S+)")
class _Factor:
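        """Vocabulary and per-sentence data of a single factor.

        Ids 0-2 are reserved for <pad>, <unk> and <root>; when ``train``
        is given, its word (and character) vocabularies are reused so ids
        stay consistent with the training data.
        """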
ROOT = 2
def __init__(self, with_root, characters, train=None):
self.words_map = train.words_map if train else {'<pad>': 0, '<unk>': 1, '<root>': 2}
self.words = train.words if train else ['<pad>', '<unk>', '<root>']
self.word_ids = []
self.strings = []
self.with_root = with_root
self.characters = characters
if characters:
self.alphabet_map = train.alphabet_map if train else {'<pad>': 0, '<unk>': 1, '<root>': 2}
self.alphabet = train.alphabet if train else ['<pad>', '<unk>', '<root>']
self.charseqs_map = {'<pad>': 0, '<unk>': 1, '<root>': 2}
self.charseqs = [[0], [1], [2]]
self.charseq_ids = []
def __init__(self, path=None, text=None, embeddings=[], train=None, shuffle_batches=True,
override_variant=None, max_sentence_len=None, max_sentences=None):
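        """Load sentences from ``path`` or from the string ``text``.

        ``embeddings`` is either a list of per-sentence numpy arrays or a
        list of .npz paths whose arrays are concatenated feature-wise;
        ``train`` provides vocabularies built on the training data.
        """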
# Create factors and other variables
self._factors = []
for f in range(self.FACTORS):
self._factors.append(self._Factor(f == self.FORMS, f == self.FORMS, train._factors[f] if train else None))
self._extras = []
form_dict = {}
self._lr_allow_copy = train._lr_allow_copy if train else None
lemma_dict_with_copy, lemma_dict_no_copy = {}, {}
self._variant_map = train._variant_map if train else {}
self._variants = []
# Load contextualized embeddings
if isinstance(embeddings, list) and all(isinstance(embedding, np.ndarray) for embedding in embeddings):
self._embeddings = embeddings
else:
self._embeddings = []
for embeddings_path in embeddings:
with np.load(embeddings_path, allow_pickle=True) as embeddings_file:
for i, (_, value) in enumerate(embeddings_file.items()):
if max_sentence_len: value = value[:max_sentence_len]
if i >= len(self._embeddings): self._embeddings.append(value)
else: self._embeddings[i] = np.concatenate([self._embeddings[i], value], axis=1)
assert i + 1 == len(self._embeddings)
self._embeddings_size = self._embeddings[0].shape[1] if self._embeddings else 0
# Load the sentences
with open(path, "r", encoding="utf-8") if path is not None else io.StringIO(text) as file:
in_sentence = False
variant = ""
for line in file:
line = line.rstrip("\r\n")
if line:
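                    # Comment lines, multiword tokens and empty nodes are not
                    # parsed; they are kept verbatim in _extras and written
                    # back out by write_sentence.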
if self.re_extras.match(line):
variant_match = self.re_variant.match(line)
if variant_match:
variant = variant_match.group(1)
if in_sentence:
while len(self._extras) < len(self._factors[0].word_ids): self._extras.append([])
while len(self._extras[-1]) <= len(self._factors[0].word_ids[-1]) - self._factors[0].with_root:
self._extras[-1].append("")
else:
while len(self._extras) <= len(self._factors[0].word_ids): self._extras.append([])
if not len(self._extras[-1]): self._extras[-1].append("")
self._extras[-1][-1] += ("\n" if self._extras[-1][-1] else "") + line
continue
if max_sentence_len and in_sentence and len(self._factors[0].strings[-1]) - self._factors[0].with_root >= max_sentence_len:
continue
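                    # Split the line into columns, dropping the leading ID
                    # column so that the rest aligns with the FORMS..MISC factors.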
columns = line.split("\t")[1:]
for f in range(self.FACTORS):
factor = self._factors[f]
if not in_sentence:
if len(factor.word_ids): factor.word_ids[-1] = np.array(factor.word_ids[-1], np.int32)
factor.word_ids.append([])
factor.strings.append([])
if factor.characters: factor.charseq_ids.append([])
if factor.with_root:
factor.word_ids[-1].append(factor.ROOT)
factor.strings[-1].append(factor.words[factor.ROOT])
if factor.characters: factor.charseq_ids[-1].append(factor.ROOT)
word = columns[f]
factor.strings[-1].append(word)
# Preprocess word
if f == self.LEMMAS and self._lr_allow_copy is not None:
word = self._gen_lemma_rule(columns[self.FORMS], columns[self.LEMMAS], self._lr_allow_copy)
# Character-level information
if factor.characters:
if word not in factor.charseqs_map:
factor.charseqs_map[word] = len(factor.charseqs)
factor.charseqs.append([])
for c in word:
if c not in factor.alphabet_map:
if train:
c = '<unk>'
else:
factor.alphabet_map[c] = len(factor.alphabet)
factor.alphabet.append(c)
factor.charseqs[-1].append(factor.alphabet_map[c])
factor.charseq_ids[-1].append(factor.charseqs_map[word])
# Word-level information
if f == self.HEAD:
factor.word_ids[-1].append(int(word) if word != "_" else -1)
elif f == self.FORMS and not train:
factor.word_ids[-1].append(0)
form_dict[word] = form_dict.get(word, 0) + 1
elif f == self.LEMMAS and self._lr_allow_copy is None:
factor.word_ids[-1].append(0)
lemma_dict_with_copy[self._gen_lemma_rule(columns[self.FORMS], word, True)] = 1
lemma_dict_no_copy[self._gen_lemma_rule(columns[self.FORMS], word, False)] = 1
else:
if word not in factor.words_map:
if train:
word = '<unk>'
else:
factor.words_map[word] = len(factor.words)
factor.words.append(word)
factor.word_ids[-1].append(factor.words_map[word])
if not in_sentence:
if override_variant is not None: variant = override_variant
if (variant not in self._variant_map) and (not train):
self._variant_map[variant] = len(self._variant_map)
self._variants.append(self._variant_map.get(variant, 0))
in_sentence = True
else:
in_sentence = False
if max_sentences is not None and len(self._factors[self.FORMS].word_ids) >= max_sentences:
break
# Finalize forms if needed
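        # When building the training vocabulary (train is None), forms seen
        # only once are mapped to <unk> so the model learns to handle
        # unknown words.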
if not train:
forms = self._factors[self.FORMS]
for i in range(len(forms.word_ids)):
for j in range(forms.with_root, len(forms.word_ids[i])):
word = "<unk>" if form_dict[forms.strings[i][j]] < 2 else forms.strings[i][j]
if word not in forms.words_map:
forms.words_map[word] = len(forms.words)
forms.words.append(word)
forms.word_ids[i][j] = forms.words_map[word]
# Finalize lemmas if needed
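        # Choose the lemma-rule variant (with or without copy operations)
        # producing fewer distinct rules, then encode all lemmas with it.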
if self._lr_allow_copy is None:
        self._lr_allow_copy = len(lemma_dict_with_copy) < len(lemma_dict_no_copy)
lemmas = self._factors[self.LEMMAS]
for i in range(len(lemmas.word_ids)):
for j in range(lemmas.with_root, len(lemmas.word_ids[i])):
word = self._gen_lemma_rule(self._factors[self.FORMS].strings[i][j - lemmas.with_root + self._factors[self.FORMS].with_root],
lemmas.strings[i][j], self._lr_allow_copy)
if word not in lemmas.words_map:
lemmas.words_map[word] = len(lemmas.words)
lemmas.words.append(word)
lemmas.word_ids[i][j] = lemmas.words_map[word]
# Compute sentence lengths
sentences = len(self._factors[self.FORMS].word_ids)
self._sentence_lens = np.zeros([sentences], np.int32)
for i in range(len(self._factors[self.FORMS].word_ids)):
self._sentence_lens[i] = len(self._factors[self.FORMS].word_ids[i]) - self._factors[self.FORMS].with_root
self._shuffle_batches = shuffle_batches
self._permutation = np.random.permutation(len(self._sentence_lens)) if self._shuffle_batches else np.arange(len(self._sentence_lens))
if self._embeddings:
assert sentences == len(self._embeddings)
for i in range(sentences):
assert self._sentence_lens[i] == len(self._embeddings[i]), "{} {} {}".format(i, self._sentence_lens[i], len(self._embeddings[i]))
@property
def sentence_lens(self):
return self._sentence_lens
@property
def factors(self):
return self._factors
@property
def variants(self):
return len(self._variant_map)
@property
def embeddings_size(self):
return self._embeddings_size
def save_mappings(self, path):
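        """Pickle only the mappings required to process new data.

        A skeleton dataset is built whose factors share this dataset's
        vocabularies (via the ``train`` argument of _Factor) but carry no
        sentence data.
        """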
mappings = UDPipe2Dataset.__new__(UDPipe2Dataset)
for field in ["_lr_allow_copy", "_variant_map", "_embeddings_size"]:
setattr(mappings, field, getattr(self, field))
mappings._factors = []
for factor in self._factors:
mappings._factors.append(mappings._Factor(factor.with_root, factor.characters, factor))
with open(path, "wb") as mappings_file:
pickle.dump(mappings, mappings_file, protocol=3)
@staticmethod
def load_mappings(path):
with open(path, "rb") as mappings_file:
return pickle.load(mappings_file)
def epoch_finished(self):
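        """Return True and reset the batch permutation once the epoch ends."""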
if len(self._permutation) == 0:
self._permutation = np.random.permutation(len(self._sentence_lens)) if self._shuffle_batches else np.arange(len(self._sentence_lens))
return True
return False
def next_batch(self, batch_size, max_form_length=64):
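        """Return the next batch.

        The result is (sentence_lens, word_ids, charseq_ids, charseqs,
        charseq_lens); word_ids also contains the variant ids and, when
        loaded, the contextualized embeddings. Character sequences are
        deduplicated within the batch and truncated to ``max_form_length``.
        """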
batch_size = min(batch_size, len(self._permutation))
batch_perm = self._permutation[:batch_size]
self._permutation = self._permutation[batch_size:]
# General data
batch_sentence_lens = self._sentence_lens[batch_perm]
max_sentence_len = np.max(batch_sentence_lens)
# Word-level data
batch_word_ids = []
for factor in self._factors:
batch_word_ids.append(np.zeros([batch_size, max_sentence_len + factor.with_root], np.int32))
for i in range(batch_size):
batch_word_ids[-1][i, 0:batch_sentence_lens[i] + factor.with_root] = factor.word_ids[batch_perm[i]]
# Variants
batch_word_ids.append(np.zeros([batch_size], np.int32))
for i in range(batch_size):
batch_word_ids[-1][i] = self._variants[batch_perm[i]]
# Contextualized embeddings
if self._embeddings:
forms = self._factors[self.FORMS]
batch_word_ids.append(np.zeros([batch_size, max_sentence_len + forms.with_root, self.embeddings_size], np.float16))
for i in range(batch_size):
batch_word_ids[-1][i, forms.with_root:forms.with_root + len(self._embeddings[batch_perm[i]])] = \
self._embeddings[batch_perm[i]]
# Character-level data
batch_charseq_ids, batch_charseqs, batch_charseq_lens = [], [], []
for factor in self._factors:
if not factor.characters:
batch_charseq_ids.append([])
batch_charseqs.append([])
batch_charseq_lens.append([])
continue
batch_charseq_ids.append(np.zeros([batch_size, max_sentence_len + factor.with_root], np.int32))
charseqs_map = {}
charseqs = []
charseq_lens = []
for i in range(batch_size):
for j, charseq_id in enumerate(factor.charseq_ids[batch_perm[i]]):
if charseq_id not in charseqs_map:
charseqs_map[charseq_id] = len(charseqs)
charseqs.append(factor.charseqs[charseq_id][:max_form_length])
batch_charseq_ids[-1][i, j] = charseqs_map[charseq_id]
batch_charseq_lens.append(np.array([len(charseq) for charseq in charseqs], np.int32))
batch_charseqs.append(np.zeros([len(charseqs), np.max(batch_charseq_lens[-1])], np.int32))
for i in range(len(charseqs)):
batch_charseqs[-1][i, 0:len(charseqs[i])] = charseqs[i]
return self._sentence_lens[batch_perm], batch_word_ids, batch_charseq_ids, batch_charseqs, batch_charseq_lens
def write_sentence(self, output, index, overrides):
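        """Write sentence ``index`` to ``output`` in CoNLL-U format.

        Predicted values from ``overrides`` replace the stored fields, and
        the verbatim comment/multiword-token lines from _extras are
        reproduced at their original positions.
        """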
for i in range(self._sentence_lens[index] + 1):
# Start by writing extras
if index < len(self._extras) and i < len(self._extras[index]) and self._extras[index][i]:
print(self._extras[index][i], file=output)
if i == self._sentence_lens[index]: break
fields = []
fields.append(str(i + 1))
for f in range(self.FACTORS):
factor = self._factors[f]
offset = i + factor.with_root
field = factor.strings[index][offset]
# Overrides
if overrides is not None and f < len(overrides) and overrides[f] is not None:
override = overrides[f][offset]
if f == self.HEAD:
field = str(override) if override >= 0 else "_"
elif (f == self.LEMMAS or f == self.XPOS) and isinstance(override, str):
field = override
else:
field = factor.words[override]
if f == self.LEMMAS:
try:
field = self._apply_lemma_rule(fields[-1], field)
                            except Exception:
print("Applying lemma rule failed for form '{}' and rule '{}', using the form as lemma".format(
fields[-1], field), file=sys.stderr)
field = fields[-1]
# Do not generate empty lemmas
field = field or fields[-1]
fields.append(field)
print("\t".join(fields), file=output)
print(file=output)
@staticmethod
def _min_edit_script(source, target, allow_copy):
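        # Dynamic programming over minimum edit scripts: "→" copies a source
        # character (zero cost, only when allow_copy), "-" deletes a source
        # character and "+c" inserts character c (cost 1 each).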
a = [[(len(source) + len(target) + 1, None)] * (len(target) + 1) for _ in range(len(source) + 1)]
for i in range(0, len(source) + 1):
for j in range(0, len(target) + 1):
if i == 0 and j == 0:
a[i][j] = (0, "")
else:
if allow_copy and i and j and source[i - 1] == target[j - 1] and a[i-1][j-1][0] < a[i][j][0]:
a[i][j] = (a[i-1][j-1][0], a[i-1][j-1][1] + "→")
if i and a[i-1][j][0] < a[i][j][0]:
a[i][j] = (a[i-1][j][0] + 1, a[i-1][j][1] + "-")
if j and a[i][j-1][0] < a[i][j][0]:
a[i][j] = (a[i][j-1][0] + 1, a[i][j-1][1] + "+" + target[j - 1])
return a[-1][-1][1]
@staticmethod
def _gen_lemma_rule(form, lemma, allow_copy):
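        # Encode the form→lemma transformation as "<casing>;<rule>": the rule
        # is "a<lemma>" (take the lemma verbatim) or "d<prefix>¦<suffix>",
        # two edit scripts applied around the longest common substring of
        # the lowercased form and lemma.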
form = form.lower()
previous_case = -1
lemma_casing = ""
for i, c in enumerate(lemma):
case = "↑" if c.lower() != c else "↓"
if case != previous_case:
lemma_casing += "{}{}{}".format("¦" if lemma_casing else "", case, i if i <= len(lemma) // 2 else i - len(lemma))
previous_case = case
lemma = lemma.lower()
best, best_form, best_lemma = 0, 0, 0
for l in range(len(lemma)):
for f in range(len(form)):
cpl = 0
while f + cpl < len(form) and l + cpl < len(lemma) and form[f + cpl] == lemma[l + cpl]: cpl += 1
if cpl > best:
best = cpl
best_form = f
best_lemma = l
rule = lemma_casing + ";"
if not best:
rule += "a" + lemma
else:
rule += "d{}¦{}".format(
UDPipe2Dataset._min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy),
UDPipe2Dataset._min_edit_script(form[best_form + best:], lemma[best_lemma + best:], allow_copy),
)
return rule
@staticmethod
def _apply_lemma_rule(form, lemma_rule):
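        # Inverse of _gen_lemma_rule: replay the edit scripts on the form
        # (falling back to the form itself if they fail) and then restore
        # the recorded casing segments.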
casing, rule = lemma_rule.split(";", 1)
if rule.startswith("a"):
lemma = rule[1:]
else:
form = form.lower()
rules, rule_sources = rule[1:].split("¦"), []
assert len(rules) == 2
for rule in rules:
source, i = 0, 0
while i < len(rule):
if rule[i] == "→" or rule[i] == "-":
source += 1
else:
assert rule[i] == "+"
i += 1
i += 1
rule_sources.append(source)
try:
lemma, form_offset = "", 0
for i in range(2):
j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
while j < len(rules[i]):
if rules[i][j] == "→":
lemma += form[offset]
offset += 1
elif rules[i][j] == "-":
offset += 1
else:
                        assert rules[i][j] == "+"
lemma += rules[i][j + 1]
j += 1
j += 1
if i == 0:
lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
        except Exception:
lemma = form
for rule in casing.split("¦"):
if rule == "↓0": continue # The lemma is lowercased initially
if not rule: continue # Empty lemma might generate empty casing rule
case, offset = rule[0], int(rule[1:])
lemma = lemma[:offset] + (lemma[offset:].upper() if case == "↑" else lemma[offset:].lower())
return lemma