import re
import os
import unicodedata

import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# download_path = os.path.join(os.getcwd(), 'nltk_packages')
# nltk.data.path.append(download_path)
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases require punkt_tab for word_tokenize

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        # Decompose accented characters (NFKD), then drop non-ASCII bytes
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('ascii')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    # Build the stopword set once; calling stopwords.words() per token is slow
    stop_words = set(stopwords.words('english'))
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    """Run the cleanup pipeline over a list of tokenized words"""
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_stopwords(words)
    # words = stem_words(words)
    # words = lemmatize_verbs(words)
    return words

def preprocess(documents):
    """Tokenize and normalize each document, returning cleaned strings"""
    preprocessed_documents = []
    for document in documents:
        tokens = nltk.word_tokenize(document)
        preprocessed = normalize(tokens)
        preprocessed = ' '.join(preprocessed)  # tokens are already strings
        preprocessed_documents.append(preprocessed)
    return preprocessed_documents
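

# Example usage: a minimal sketch of the pipeline end to end. The sample
# documents below are illustrative and not part of the original script.
if __name__ == '__main__':
    sample_docs = [
        'The 1st quick brown fox jumps over the lazy dog!',
        'Café menus often include non-ASCII characters.',
    ]
    for cleaned in preprocess(sample_docs):
        print(cleaned)
    # Expected output (approximately):
    #   1st quick brown fox jumps lazy dog
    #   cafe menus often include nonascii characters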