File size: 2,664 Bytes
3d9c842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

from textacy.preprocessing.remove import accents, brackets, punctuation
from textacy.preprocessing.replace import numbers, urls
from textacy.preprocessing.normalize import whitespace

import os

def clean_page(page):
    """Strip heading markers, line breaks, tabs and other noise from a page.

    The text is passed through the textacy helpers imported at the top of
    this file (``brackets``, ``accents``, ``urls``, ``whitespace``) and is
    returned lowercased.

    Args:
        page: raw page text as a single string.

    Returns:
        The cleaned, whitespace-normalised, lowercased text.
    """
    # Drop runs of "=" (heading markers such as "== History ==").
    page = re.sub("=+", "", page)
    # Turn line breaks and tabs into a space instead of deleting them:
    # deleting fuses the neighbouring text ("end.\nNext" -> "end.Next"),
    # which corrupts tokens and hides sentence boundaries from the
    # tokenizer. Surplus spaces are left to whitespace() below.
    page = re.sub(r"[\n\t]+", " ", page)
    page = accents(brackets(page))
    page = urls(page)

    return whitespace(page).lower()

def clean_sentences(s):
    """Remove special (non-alphanumeric) characters from a sentence.

    Previously the substitution result was stored in an unused local and
    the input was returned untouched, so the function did nothing. The
    cleaned string is now returned. Whitespace is deliberately excluded from
    the removal pattern so that words stay separated.

    Args:
        s: the sentence to clean.

    Returns:
        ``s`` with every character other than ASCII letters, digits and
        whitespace removed.
    """
    # \s is kept out of the removed set; stripping it would glue all the
    # words of the sentence into a single token.
    pattern = r'[^A-Za-z0-9\s]+'
    return re.sub(pattern, '', s)


  
# Single Porter stemmer instance shared by every prepare_document call.
ps = PorterStemmer()
def prepare_document(doc):
    """Preprocess and tokenize a document for tfidf.

    The document is cleaned with clean_page (which also lowercases it),
    split into sentences, stripped of punctuation sentence by sentence,
    split into words, and every word is stemmed with the module-level
    stemmer ``ps``.

    Args:
        doc: raw document text.

    Returns:
        One string: the stemmed words of each sentence joined by single
        spaces, and the sentences themselves joined by single spaces.
    """
    cleaned_text = clean_page(doc)

    stemmed_sentences = []
    for sentence in sent_tokenize(cleaned_text):
        # Punctuation is handled per sentence, before word tokenization.
        words = word_tokenize(punctuation(sentence))
        stemmed_sentences.append(" ".join(map(ps.stem, words)))

    return " ".join(stemmed_sentences)


# small function to calculate the cosine similarity of one pair of vectors
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Computed as dot(v1, v2) / (||v1|| * ||v2||), using Euclidean norms.

    Args:
        v1: first vector (any sequence or array accepted by ``np.dot``).
        v2: second vector, the same length as ``v1``.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero norm.
    """
    numerator = np.dot(v1, v2)
    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))

    # An all-zero vector has no direction. Dividing by the zero norm would
    # yield nan (with a RuntimeWarning), and a nan makes any later sort by
    # similarity unreliable, so report "no similarity" instead.
    if denom == 0:
        return 0.0

    return numerator / denom


def cos_dicts(names, vects):
    """Build a dict of dicts holding the cosine similarity of every pair.

    The nested layout lets a caller index straight into the pair it wants:
    the outer key is the name of interest (e.g. a game) and its value is a
    dictionary whose keys are the partner names and whose values are the
    cosine similarity between the two.

    Args:
        names: names, paired positionally with ``vects``.
        vects: vectors, one per name.

    Returns:
        ``{name: {other_name: similarity, ...}, ...}``. Entries whose name
        equals the outer name are left out of the inner dictionary.
    """
    return {
        name: {
            partner: cosine_similarity(vect, partner_vect)
            for partner, partner_vect in zip(names, vects)
            # Skip the self-pair (matched by name, not by position).
            if partner != name
        }
        for name, vect in zip(names, vects)
    }

def retrieve_top_k_similar(n1, similarity_dict, k):
    """Return the ``k`` entries most similar to ``n1``.

    Args:
        n1: outer key to look up in ``similarity_dict``.
        similarity_dict: dict of dicts mapping name -> {other name: score}.
        k: number of leading entries to keep.

    Returns:
        A list of ``(name, score)`` tuples ordered by score, highest first,
        truncated to the first ``k`` items.

    Raises:
        KeyError: if ``n1`` is not a key of ``similarity_dict``.
    """
    def score_of(pair):
        # Each item is a (name, score) tuple; rank on the score.
        return pair[1]

    ranked = sorted(similarity_dict[n1].items(), key=score_of, reverse=True)
    return ranked[:k]