#############################
# Imports
#############################
# Python modules
import re
# Remote modules
# Local modules
from utils import (
read_simple_text_file_2_vec
)
#############################
# Utils
#############################
class ParsingUtils:
STOPWORDS = read_simple_text_file_2_vec('english_stopwords.txt', store_dir='kgs_binding')
@staticmethod
    def remove_pontuation(text):
        # Replace every non-letter character (punctuation, digits, etc.) with a
        # space; this already strips all punctuation, so no separate
        # str.translate pass is needed.
        return re.sub(r"[^a-zA-Z]", " ", text)
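    # Example (illustrative): remove_pontuation('Hello, world!') -> 'Hello  world '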
@staticmethod
def clear_common_words(index_with_words):
return [(word, (s, e)) for (word, (s, e)) in index_with_words if word not in ParsingUtils.STOPWORDS]
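    # Example (illustrative, assuming 'the' appears in english_stopwords.txt):
    # clear_common_words([('the', (0, 2)), ('fox', (4, 6))]) -> [('fox', (4, 6))]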
@staticmethod
    def is_word_a_relevant_one(ignore_common_words, word):
        # A word is relevant unless stopword filtering is enabled and the word
        # is a stopword.
        if ignore_common_words:
            return word not in ParsingUtils.STOPWORDS
        return True
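    # Example (illustrative, assuming 'the' is a stopword):
    # is_word_a_relevant_one(True, 'the') -> False
    # is_word_a_relevant_one(False, 'the') -> True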
@staticmethod
    def get_word_range_mapping(context, word_token):
        # Return the (start, end) character offsets of word_token in context;
        # the end index is inclusive. Searching for the full token avoids
        # matching an earlier occurrence of just its first word.
        word_token_start = context.index(word_token)
        word_token_end = word_token_start + len(word_token) - 1  # inclusive end
        return word_token_start, word_token_end
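    # Example (illustrative): get_word_range_mapping('the red fox', 'red fox')
    # -> (4, 10), i.e. context[4:11] == 'red fox'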
@staticmethod
def n_grams(words_vector, n):
        # Build every contiguous window of n tokens and join each into a string.
        grams = [words_vector[i:i + n] for i in range(len(words_vector) - n + 1)]
        return [' '.join(x) for x in grams]
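    # Example (illustrative): n_grams(['the', 'red', 'fox'], 2) -> ['the red', 'red fox']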
@staticmethod
def n_grams_with_idx(words_vector, n):
        # words_vector holds (word, start_offset) pairs. Each n-gram is joined
        # into a string and paired with its (start, end) character span, where
        # end is exclusive: the offset of the last word plus its length.
        grams = [words_vector[i:i + n] for i in range(len(words_vector) - n + 1)]
        return [(' '.join(pair[0] for pair in x), (x[0][1], x[-1][1] + len(x[-1][0]))) for x in grams]
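    # Example (illustrative): n_grams_with_idx([('the', 0), ('red', 4), ('fox', 8)], 2)
    # -> [('the red', (0, 7)), ('red fox', (4, 11))]  (note the exclusive ends)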
@staticmethod
def n_grams_context_producer_simple(context, n_gram=2):
context_tokens = context.strip().split(' ')
#context_tokens = [w for w in context_tokens if w not in STOPWORDS]
n_grams_context = []
for i in range(n_gram):
n_gram_content = ParsingUtils.n_grams(context_tokens, n_gram-i)
n_grams_context.append(n_gram_content)
return n_grams_context
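    # Example (illustrative): n_grams_context_producer_simple('a b c', 2)
    # -> [['a b', 'b c'], ['a', 'b', 'c']]  (one list per size, from n_gram down to 1)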
@staticmethod
def n_grams_n_words_extractor(context, n_gram=3):
        context_tokens = context.strip().split(' ')
        # Pair each token with its character offset in the original context
        # (assumes tokens are separated by single spaces).
        context_tokens_with_index_info = []
        word_idx = 0
        for word in context_tokens:
            context_tokens_with_index_info.append((word, word_idx))
            word_idx += len(word) + 1  # +1 for the separating space
#context_tokens = [w for w in context_tokens if w not in STOPWORDS]
n_grams_context = []
for i in range(n_gram):
n_gram_content = ParsingUtils.n_grams_with_idx(context_tokens_with_index_info, n_gram-i)
n_grams_context.extend(n_gram_content)
return n_grams_context
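
#############################
# Usage sketch
#############################
# Illustrative only: running this assumes 'english_stopwords.txt' exists under
# the 'kgs_binding' store_dir, since STOPWORDS is loaded at import time.
if __name__ == '__main__':
    context = 'the quick brown fox'
    # All 3-, 2- and 1-grams of the context, with their character spans.
    for gram, (start, end) in ParsingUtils.n_grams_n_words_extractor(context, n_gram=3):
        print(f'{gram!r:<20} -> context[{start}:{end}] == {context[start:end]!r}')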