Spaces (runtime error)
arjunpatel committed
Commit · 3d9c842
1 Parent(s): b688d81
Upload requirements and script
- data_cleaning.py +90 -0
- requirements.txt +5 -0
data_cleaning.py
ADDED
@@ -0,0 +1,90 @@
+
+import pandas as pd
+import numpy as np
+import re
+
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import PorterStemmer
+
+from textacy.preprocessing.remove import accents, brackets, punctuation
+from textacy.preprocessing.replace import numbers, urls
+from textacy.preprocessing.normalize import whitespace
+
+import os
+
+def clean_page(page):
+    # given a page, removes headings, newlines, tabs, brackets, accents, and URLs
+    page = re.sub("=+", "", page)
+    page = page.replace("\n", "")
+    page = page.replace("\t", "")
+    page = accents(brackets(page))
+    page = urls(page)
+
+    return whitespace(page).lower()
+
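A quick sketch of what clean_page produces on a made-up wiki-style page (the sample text is hypothetical; textacy's urls replaces each link with "_URL_" by default):

    sample = "== History ==\nCatan [1] was first published in 1995.\tSee https://example.com"
    print(clean_page(sample))
    # the "==" heading marks, bracketed "[1]", tabs, and newlines are gone,
    # the URL becomes "_URL_", and the whole string is lowercased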
+def clean_sentences(s):
+
+    # strip every character that is not a letter or digit (note: this also removes spaces)
+    pattern = r'[^A-Za-z0-9]+'
+    s = re.sub(pattern, '', s)
+    return s
+
+
+
+ps = PorterStemmer()
+def prepare_document(doc):
+    # given a document, preprocesses and tokenizes it for tfidf
+
+    # clean the document of misc symbols and headings, lowercase it
+    doc = clean_page(doc)
+
+    # tokenize by sentence and then by word
+    sentences = sent_tokenize(doc)
+
+    # remove punctuation
+    sentences = [punctuation(s) for s in sentences]
+
+
+    # stem every word
+    sentences_and_words = [word_tokenize(s) for s in sentences]
+
+    prepared_doc = []
+
+    for sent in sentences_and_words:
+        stemmed_sentences = []
+        for word in sent:
+            stemmed_sentences.append(ps.stem(word))
+        cleaned_sentence = " ".join(stemmed_sentences)
+        prepared_doc.append(cleaned_sentence)
+    return " ".join(prepared_doc)
+
+
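prepare_document is commented as feeding tf-idf; here is a minimal sketch of that hookup using scikit-learn's TfidfVectorizer (the two example documents are made up, and nltk's punkt model must be downloaded once for sent_tokenize/word_tokenize to run):

    import nltk
    from sklearn.feature_extraction.text import TfidfVectorizer

    nltk.download("punkt")  # tokenizer data used by sent_tokenize/word_tokenize

    docs = ["Catan is a resource-trading game.",
            "Monopoly is a property-trading game."]  # hypothetical pages
    prepared = [prepare_document(d) for d in docs]

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(prepared).toarray()  # one dense row per document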
+# small function to calculate cosine similarity of a pair of vectors
+def cosine_similarity(v1, v2):
+    numerator = np.dot(v1, v2)
+    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))
+
+    return numerator / denom
+
+
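As a worked check: with v1 = [1, 0] and v2 = [1, 1], the dot product is 1 and the norms are 1 and √2, so the similarity is 1/√2 ≈ 0.7071:

    v1, v2 = np.array([1.0, 0.0]), np.array([1.0, 1.0])
    print(cosine_similarity(v1, v2))  # ~0.7071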
+def cos_dicts(names, vects):
+
+    # given a set of vectors, create a dict of dicts for cosine similarity.
+    # This dict-of-dicts structure allows us to index directly into the pair we want:
+    # the first key is our desired game, and its value is a dictionary of partner games.
+
+    # The inner key is the second game we wish to look up, and its value is that game's
+    # cosine similarity to our first game.
+
+    d = {}
+    for name, vect in zip(names, vects):
+        cos_sim_by_vect = {}
+        for n2, v2 in zip(names, vects):
+            if n2 != name:
+                cos_sim_by_vect[n2] = cosine_similarity(vect, v2)
+        d[name] = cos_sim_by_vect
+    return d
+
+def retrieve_top_k_similar(n1, similarity_dict, k):
+    inner_dict = similarity_dict[n1]
+    # sort the dictionary by value, descending, then retrieve the top k entries
+    return sorted(inner_dict.items(), reverse=True, key=lambda x: x[1])[:k]
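Tying the two lookup functions together with the tf-idf rows from the earlier sketch (the game names and k are illustrative):

    names = ["catan", "monopoly"]      # one hypothetical title per tf-idf row
    sims = cos_dicts(names, vectors)   # vectors from the TfidfVectorizer sketch above
    print(retrieve_top_k_similar("catan", sims, k=1))  # [(name, similarity)], most similar first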
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+gradio
+sentence_transformers
+datasets
+scikit-learn
+torch
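Note that data_cleaning.py also imports pandas, numpy, nltk, and textacy, none of which appear in this list; if the Space installs only these five packages, those imports would fail at startup, which would be consistent with the runtime-error badge at the top of the page.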