Upload 12 files
- .gitattributes +1 -0
- README.md +17 -12
- app.py +72 -0
- articulos.csv +3 -0
- articulos_indexados.csv +0 -0
- articulos_titulos_bow.csv +0 -0
- carga_articulos.py +14 -0
- config.toml +4 -0
- entrenamiento_modelo.py +50 -0
- preprocesamiento_articulos.py +38 -0
- repartidor_periodicos.jpeg +0 -0
- requirements.txt +9 -0
- resultados_consulta.py +80 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+articulos.csv filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,12 +1,17 @@
-
-
-
-
-
-
-
-
-
-
-
-
+Personal Intelligent Agent: your personal intelligent agent for doing research work, answering questions, and solving problems.
+
+Steps:
+0. Search API:
+Serper API free credits:
+https://serper.dev/dashboard
+
+1. Test on the portal:
+pip install streamlit
+streamlit run app.py
+
+
+2. Test on the API:
+Run FastAPI:
+uvicorn app:app --host 0.0.0.0 --port 10000
+
+python test_server.py
app.py
ADDED
@@ -0,0 +1,72 @@
+import pandas as pd
+import streamlit as st
+from transformers import pipeline
+from carga_articulos import cargar_articulos
+from preprocesamiento_articulos import limpieza_articulos
+from entrenamiento_modelo import term_document_matrix, tf_idf_score
+from resultados_consulta import resultados_consulta, detalles_resultados
+
+def crear_indice():
+    # Build the TF-IDF index from the articles and persist it to CSV
+    df = cargar_articulos()
+    vocab = limpieza_articulos(df)
+
+    td_matrix = term_document_matrix(df, vocab, 'ID', 'titulo')
+    td_idf_matrix = tf_idf_score(td_matrix, df.ID.values)
+
+    td_idf_matrix.to_csv('articulos_indexados.csv')
+
+def load_qa_model():
+    # Load the default question-answering pipeline from transformers
+    model = pipeline("question-answering")
+    return model
+
+# Use Streamlit to create a web app
+def main():
+
+    #crear_indice()  # Uncomment to rebuild articulos_indexados.csv
+
+    st.set_page_config(page_title="Buscador de noticias periodicos dominicanos", page_icon="📰")
+    st.header('El Repartidor Dominicano')
+    st.image('repartidor_periodicos.jpeg', width=150)
+
+    df = cargar_articulos()
+    articulos_indexados = pd.read_csv('articulos_indexados.csv')
+    articulos_indexados = articulos_indexados.set_index('Unnamed: 0')
+    qa = load_qa_model()
+    sentence = "The house is blue"  # placeholder context for the QA pipeline
+
+    query = st.text_input(
+        "Escribe tus términos de búsqueda o haz una pregunta terminando con el caracter ?:"
+    )
+
+    if query:
+
+        if '?' in query:
+            st.write("Contestando a: ", query)
+            answers = qa(question=query, context=sentence)
+            st.info(answers['answer'])
+
+        else:
+
+            st.write("Buscando: ", query)
+            result = resultados_consulta(df, articulos_indexados, query)
+
+            if result.empty:
+                st.info("No se encontraron artículos para la búsqueda solicitada")
+
+            else:
+                #st.write(detalles_resultados(df,result), unsafe_allow_html=True)
+                df_results = detalles_resultados(df, result)
+                N_cards_per_row = 1
+                for n_row, row in df_results.reset_index().iterrows():
+                    i = n_row % N_cards_per_row
+                    if i == 0:
+                        st.write("---")
+                        cols = st.columns(N_cards_per_row, gap="large")
+                    # draw the card
+                    with cols[n_row % N_cards_per_row]:
+                        st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()} ")
+                        st.markdown(f"**{row['titulo'].strip()}**")
+                        st.markdown(f"{row['resumen'].strip()}")
+                        st.markdown(f"{row['link']}")
+
+if __name__ == "__main__":
+    main()
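Since the crear_indice() call in main() is commented out, app.py assumes articulos_indexados.csv already exists next to it. A minimal sketch of the one-time rebuild, assuming articulos.csv has been fetched from Git LFS:

import app  # module import only; the Streamlit UI starts in main()

app.crear_indice()  # regenerates articulos_indexados.csv from articulos.csv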
articulos.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae2500b42f5dcd8fbc7e7553228236e4fd97a32d29d24e8afbaa12b111b08889
+size 99898509
articulos_indexados.csv
ADDED
The diff for this file is too large to render. See raw diff.
articulos_titulos_bow.csv
ADDED
The diff for this file is too large to render. See raw diff.
carga_articulos.py
ADDED
@@ -0,0 +1,14 @@
+import pandas as pd
+import streamlit as st
+
+@st.cache_data
+def cargar_articulos():
+    articulos = pd.read_csv('articulos.csv')
+    articulos = articulos.iloc[0:100, :]  # Load only the first 100 articles
+    # Drop the first (unnamed) column
+    articulos.drop("Unnamed: 0", axis=1, inplace=True)
+    # Create a new ID column
+    articulos['ID'] = articulos.index
+    articulos.ID = pd.Series(["D" + str(ind) for ind in articulos.ID])
+    # Return the dataframe with the articles
+    return articulos
config.toml
ADDED
@@ -0,0 +1,4 @@
+[theme]
+base = 'dark'
+primaryColor = '#F63366'
+font = 'sans serif'
entrenamiento_modelo.py
ADDED
@@ -0,0 +1,50 @@
+import pandas as pd
+import numpy as np
+
+
+# df = the dataset; a dataframe
+# vocab = the vocabulary extracted from the documents; a list of strings
+# document_index = name of the dataframe column holding the document ids
+# text = name of the dataframe column holding the document text
+
+def term_document_matrix(df, vocab, document_index, text):
+
+    vocab_index = pd.DataFrame(columns=df[document_index], index=vocab).fillna(0)
+
+    for word in vocab_index.index:
+        for doc in df[document_index]:
+            freq = df[df[document_index] == doc][text].values[0].count(word)
+            vocab_index.loc[word, doc] = freq
+    return vocab_index  # Returns a dataframe with the term-document matrix of frequencies
+
+
+# vocab_index = the term-document matrix computed in the previous step
+# document_index = series containing the document ids
+# inv_df = name of the column holding the computed inverse document frequency
+
+def tf_idf_score(vocab_index, document_index, inv_df='inverse_document_frequency'):
+
+    total_docx = len(document_index)
+    vocab_index['document_frequency'] = vocab_index.sum(axis=1)
+    vocab_index['inverse_document_frequency'] = np.log2(total_docx / vocab_index['document_frequency'])
+
+    for word in vocab_index.index:
+
+        for doc in document_index:
+
+            tf_idf = np.log2(1 + vocab_index.loc[word, doc]) * np.log2(vocab_index.loc[word, inv_df])
+            vocab_index.loc[word, 'tf_idf_' + str(doc)] = tf_idf
+
+    return vocab_index  # Returns a dataframe containing the term-document matrix,
+                        # the document frequency, the inverse document frequency,
+                        # and the tf_idf score
+
+
+def generar_archivo_indexado():
+
+    return 0
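A minimal sketch of how term_document_matrix and tf_idf_score fit together, on a hypothetical toy dataframe shaped like the ID/titulo columns produced by cargar_articulos(); the .astype(float) cast is a defensive assumption so the log operations run on a purely numeric matrix:

import pandas as pd
from entrenamiento_modelo import term_document_matrix, tf_idf_score

# Hypothetical toy corpus with the same ID/titulo layout app.py indexes
df = pd.DataFrame({
    "ID": ["D0", "D1", "D2", "D3"],
    "titulo": ["economia crece", "beisbol gana serie", "turismo sube", "economia baja"],
})
vocab = ["economia", "crece", "beisbol", "gana", "serie", "turismo", "sube", "baja"]

td = term_document_matrix(df, vocab, "ID", "titulo").astype(float)  # raw term counts per document
tdidf = tf_idf_score(td, df.ID.values)  # adds document_frequency, inverse_document_frequency and tf_idf_D* columns
print(tdidf.filter(like="tf_idf_"))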
preprocesamiento_articulos.py
ADDED
@@ -0,0 +1,38 @@
+import pandas as pd
+import numpy as np
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+
+stopwords_es = stopwords.words('spanish')
+
+def eliminar_puntuacion(articulo):
+    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':', '"', '“', '’', '”', "'", '`', '‘', '``', '\\', '/', '|', ',', '|', '<', '>', '.', '..', '...', '?', '@', "#", '$', '^', '&', '*', '_', '~', '+', '%', '=', '¿', '¡', "''"]
+    new_articulo = ""
+    for x in articulo:
+        if x not in deletion_symbols:
+            new_articulo += x
+    return new_articulo
+
+def eliminar_stopwords(articulo):
+
+    articulo_splitted = articulo.split()
+    new_articulo = ""
+    for x in articulo_splitted:
+        if x not in stopwords_es:
+            new_articulo += " " + x
+    return new_articulo
+
+def limpieza_articulos(df):
+
+    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
+    # Lowercase the text
+    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
+    # Remove punctuation
+    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
+    # Remove stopwords using the Spanish corpus that ships with nltk
+    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
+    all_text = ' '.join(df_titulos['titulo'])
+    vocab = np.unique(word_tokenize(all_text))
+    return vocab
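word_tokenize and stopwords.words('spanish') rely on NLTK corpora that are not bundled with the nltk pip package, so a one-time download is needed before limpieza_articulos runs (on recent NLTK releases word_tokenize may also request punkt_tab):

import nltk

nltk.download("stopwords")  # Spanish stopword list used by eliminar_stopwords
nltk.download("punkt")      # tokenizer models used by word_tokenize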
repartidor_periodicos.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+fastapi[all]
+openai
+python-dotenv
+pydantic==1.*
+langchain
+bs4
+tiktoken
+nltk
+scikit-learn
resultados_consulta.py
ADDED
@@ -0,0 +1,80 @@
+import pandas as pd
+import numpy as np
+from preprocesamiento_articulos import eliminar_puntuacion, eliminar_stopwords
+
+
+# The idea here is to lowercase the query and strip it of stopwords and some characters
+
+def query_processing(query):
+    query = eliminar_puntuacion(query)  # Remove punctuation
+    query = query.strip().lower()       # Lowercase
+    query = eliminar_stopwords(query)
+    return query
+
+
+# vocab_index = term-document matrix with all frequencies already computed; in our case
+#               it is what the csv holds and must be loaded into a dataframe
+# query = the query that was issued
+
+def query_score(vocab_index, query):
+    for word in np.unique(query.split()):
+        freq = query.count(word)
+        if word in vocab_index.index:
+            tf_idf = np.log2(1 + freq) * np.log2(vocab_index.loc[word].inverse_document_frequency)
+            vocab_index.loc[word, "query_tf_idf"] = tf_idf
+    if 'query_tf_idf' in vocab_index.columns:  # Only present when at least one query term is in the vocabulary
+        vocab_index['query_tf_idf'].fillna(0, inplace=True)
+
+    return vocab_index  # tf_idf matrix for the document terms and the query terms; a dataframe
+
+
+# vocab_index = dataframe holding the tf-idf scores per term for every document and for the query
+# document_index = list of document ids
+# query_scores = name of the dataframe column holding the query's tf_idf scores
+
+def cosine_similarity(vocab_index, document_index, query_scores):
+
+    cosine_scores = {}
+
+    query_scalar = np.sqrt(sum(vocab_index[query_scores] ** 2))
+
+    for doc in document_index:
+
+        doc_scalar = np.sqrt(sum(vocab_index[str(doc)] ** 2))
+        dot_prod = sum(vocab_index[str(doc)] * vocab_index[query_scores])
+        cosine = dot_prod / (query_scalar * doc_scalar)
+
+        cosine_scores[doc] = cosine
+
+    return pd.Series(cosine_scores)  # A pandas series with the query's similarity score for each document
+
+
+# data: dataframe holding the document ids and text
+# cosine_scores: series holding the cosine scores of the documents
+# document_index: name of the column holding the document ids in the data dataframe
+
+def retrieve_index(data, cosine_scores, document_index, topn=10):
+
+    data = data.set_index(document_index)
+    data['scores'] = cosine_scores
+    df_top_scores = data.reset_index().sort_values('scores', ascending=False).head(topn)
+    df_top_scores = df_top_scores[df_top_scores['scores'] > 0]
+    return df_top_scores.index  # Positional index of the top-scoring documents
+
+
+def resultados_consulta(df, articulos_indexados, query):
+    indices = pd.Index([], dtype='int64')
+    query = query_processing(query)
+    qs = query_score(articulos_indexados, query)
+    if 'query_tf_idf' in qs.columns:
+        cosenos = cosine_similarity(qs, df['ID'].values, 'query_tf_idf')
+        indices = retrieve_index(df, cosenos, 'ID', 100)  # TODO: make this number a parameter
+    return indices
+
+def detalles_resultados(df, indices):
+    top = df.loc[indices]
+    top = top.loc[:, ['titulo', 'link', 'fecha', 'resumen', 'seccion', 'feed']]
+    return top
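A minimal sketch of the search path that app.py wires to its text box, assuming articulos.csv has been pulled from Git LFS and articulos_indexados.csv has already been generated; the query string is only a hypothetical example:

import pandas as pd
from carga_articulos import cargar_articulos
from resultados_consulta import resultados_consulta, detalles_resultados

df = cargar_articulos()
articulos_indexados = pd.read_csv("articulos_indexados.csv").set_index("Unnamed: 0")

indices = resultados_consulta(df, articulos_indexados, "elecciones municipales")  # hypothetical query
if len(indices) > 0:
    print(detalles_resultados(df, indices)[["titulo", "fecha", "link"]])
else:
    print("No matching articles")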