|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def term_document_matrix(df, vocab, document_index, text):
    """Build a term-by-document matrix of raw occurrence counts.

    Args:
        df: DataFrame holding one row per document.
        vocab: Terms to count; they become the row labels of the result.
        document_index: Name of the column in ``df`` that holds the document
            identifiers; its values become the column labels of the result.
        text: Name of the column in ``df`` that holds each document's
            content. Every value must provide ``.count(term)`` (for example
            ``str`` or ``list``).

    Returns:
        An int64 DataFrame indexed by term with one column per document
        identifier; cell ``(term, doc)`` is ``content.count(term)``.

    Note:
        For ``str`` content, ``str.count`` counts non-overlapping substring
        matches, so "cat" is also counted inside "category".
    """
    terms = pd.Index(vocab)
    doc_ids = pd.Index(df[document_index])

    # Resolve every identifier to its content once instead of re-filtering
    # ``df`` for each (term, document) cell. A repeated identifier resolves
    # to the content of its first row.
    first_content = {}
    contents = []
    for doc, content in zip(doc_ids, df[text]):
        contents.append(first_content.setdefault(doc, content))

    counts = np.array(
        [[content.count(term) for content in contents] for term in terms],
        dtype=np.int64,
    ).reshape(len(terms), len(doc_ids))  # stay 2-D even for empty input

    # Building the frame from an integer array avoids creating an all-NaN
    # object frame and depending on ``fillna`` to downcast it.
    return pd.DataFrame(counts, index=terms, columns=doc_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tf_idf_score(vocab_index, document_index, inv_df='inverse_document_frequency'):
    """Add document-frequency, IDF and per-document TF-IDF columns in place.

    Args:
        vocab_index: Term-by-document count matrix (rows are terms, columns
            are document identifiers). It is modified in place.
        document_index: Iterable of the document identifiers that label the
            count columns of ``vocab_index``. Repeated identifiers are
            considered once.
        inv_df: Name of the column whose values are used as the IDF weight.
            The default is the column computed by this function; any other
            name must already exist in ``vocab_index`` and is used as-is
            (no further logarithm is applied).

    Returns:
        The same ``vocab_index`` object, extended with:
          * ``document_frequency``: number of documents containing the term;
          * ``inverse_document_frequency``: ``log2(N / document_frequency)``,
            or 0 for a term that occurs in no document;
          * ``tf_idf_<doc>`` for every document:
            ``log2(1 + count) * vocab_index[inv_df]``.
    """
    doc_labels = list(dict.fromkeys(document_index))
    total_docs = len(doc_labels)

    # Snapshot only the count columns so the derived columns added below
    # (or left over from an earlier call) never leak into the statistics.
    term_counts = vocab_index.loc[:, doc_labels].astype(float)

    # Document frequency is the number of documents containing the term,
    # not the sum of its occurrences.
    doc_freq = (term_counts > 0).sum(axis=1)
    vocab_index['document_frequency'] = doc_freq

    # Mask zero frequencies before dividing: a term present in no document
    # gets an IDF of 0 instead of inf (which would turn 0 * inf into NaN).
    observed = doc_freq.where(doc_freq > 0)
    vocab_index['inverse_document_frequency'] = (
        np.log2(total_docs / observed).fillna(0.0)
    )

    idf = vocab_index[inv_df]
    damped_tf = np.log2(1 + term_counts)
    for doc in doc_labels:
        # Assign a whole column on the frame itself; writing through
        # ``vocab_index.loc[word][col]`` only updates a temporary copy.
        vocab_index['tf_idf_' + str(doc)] = damped_tf[doc] * idf

    return vocab_index
|
|
|
|
|
|
|
|
|
|
|
def generar_archivo_indexado():
    """Placeholder that performs no work and returns 0.

    NOTE(review): the name is Spanish for "generate indexed file", which
    suggests it is meant to write the index to a file, but no such logic
    exists here yet -- confirm the intended behavior before relying on it.

    Returns:
        Always ``0``.
    """
    return 0