Spaces:
Runtime error
Runtime error
abnerguzman
committed on
Commit
•
84b9cf4
1
Parent(s):
669631f
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,7 @@ from tqdm.autonotebook import tqdm
|
|
12 |
from pinecone import Pinecone, ServerlessSpec
|
13 |
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
|
14 |
|
15 |
-
index_name = "prorata-postman-ds-128"
|
16 |
index = pc.Index(index_name)
|
17 |
|
18 |
|
@@ -75,30 +75,101 @@ def get_matches_reranked(q, k=20):
|
|
75 |
|
76 |
return matches_colbertv2
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
import gradio as gr
|
80 |
from io import StringIO
|
|
|
81 |
|
82 |
def output_chunks_reranked(msg):
|
83 |
matches_colbertv2 = get_matches_reranked(msg, k=20)
|
84 |
-
|
85 |
_out = StringIO()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
for idx, match in enumerate(matches_colbertv2):
|
87 |
-
print(
|
88 |
-
print(
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
return _out.getvalue()
|
92 |
|
93 |
-
|
94 |
with gr.Blocks() as demo:
|
95 |
msg = gr.Textbox(label='Target')
|
96 |
-
results_box = gr.Textbox(label='Matches', lines=30, autoscroll=False)
|
|
|
97 |
|
98 |
msg.submit(output_chunks_reranked, msg, results_box, queue=False)
|
99 |
-
|
100 |
-
demo.queue()
|
101 |
-
demo.launch(share=True)
|
102 |
|
103 |
|
104 |
if __name__ == "__main__":
|
|
|
12 |
from pinecone import Pinecone, ServerlessSpec
|
13 |
# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable (os.getenv yields None when the variable is not set).
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Name of the Pinecone index this app queries.
index_name = "prorata-postman-ds-128-v2"
# Handle to that index, obtained from the client above.
index = pc.Index(index_name)
|
17 |
|
18 |
|
|
|
75 |
|
76 |
return matches_colbertv2
|
77 |
|
78 |
+
def filter_matches(matches_colbertv2, score_thr=0.0):
    """Keep only the matches whose ColBERTv2 score is strictly above a threshold.

    Args:
        matches_colbertv2: Iterable of match mappings; each one must provide a
            ``'colbertv2_score'`` entry that can be compared with ``score_thr``.
        score_thr: Exclusive lower bound on the score. Defaults to ``0.0``.

    Returns:
        A new list containing the surviving matches in their original order.
    """
    # The earlier implementation also grouped the surviving matches by URL in
    # a local dict that was never read or returned; that dead bookkeeping (and
    # the unused enumerate index) is gone, so a match no longer needs a
    # 'metadata'/'url' entry just to pass through the filter.
    return [
        match
        for match in matches_colbertv2
        if match['colbertv2_score'] > score_thr
    ]
|
92 |
+
|
93 |
+
style_str = """
|
94 |
+
<style>
|
95 |
+
.doc-title {
|
96 |
+
/* font-family: cursive, sans-serif; */
|
97 |
+
font-family: Optima, sans-serif;
|
98 |
+
width: 100%;
|
99 |
+
display: inline-block;
|
100 |
+
font-size: 2em;
|
101 |
+
font-weight: bolder;
|
102 |
+
padding-top: 20px;
|
103 |
+
/* font-style: italic; */
|
104 |
+
}
|
105 |
+
.doc-url {
|
106 |
+
/* font-family: cursive, sans-serif; */
|
107 |
+
font-size: 1em;
|
108 |
+
padding-left: 40px;
|
109 |
+
padding-bottom: 10px;
|
110 |
+
/* font-weight: bolder; */
|
111 |
+
/* font-style: italic; */
|
112 |
+
}
|
113 |
+
.doc-text {
|
114 |
+
/* font-family: cursive, sans-serif; */
|
115 |
+
font-family: Optima, sans-serif;
|
116 |
+
font-size: 1.5em;
|
117 |
+
padding-left: 40px;
|
118 |
+
padding-bottom: 20px;
|
119 |
+
/* font-weight: bolder; */
|
120 |
+
/* font-style: italic; */
|
121 |
+
}
|
122 |
+
.doc-title > img {
|
123 |
+
width: 22px;
|
124 |
+
height: 22px;
|
125 |
+
border-radius: 50%;
|
126 |
+
overflow: hidden;
|
127 |
+
background-color: transparent;
|
128 |
+
display: inline-block;
|
129 |
+
vertical-align: middle;
|
130 |
+
}
|
131 |
+
.doc-title > score {
|
132 |
+
font-family: Optima, sans-serif;
|
133 |
+
font-weight: normal;
|
134 |
+
float: right;
|
135 |
+
}
|
136 |
+
</style>
|
137 |
+
"""
|
138 |
|
139 |
import gradio as gr
|
140 |
from io import StringIO
|
141 |
+
from urllib.parse import urlparse
|
142 |
|
143 |
def output_chunks_reranked(msg):
    """Build the HTML fragment that lists the reranked matches for a query.

    The query is passed to ``get_matches_reranked`` (k=20), the results are
    narrowed by ``filter_matches`` with a score threshold of 0.55, and each
    remaining match is rendered as a title row (favicon, title, score), a
    link to its URL, and its text.

    Args:
        msg: The query string entered by the user.

    Returns:
        A string of HTML. When nothing passes the filter, the fragment holds a
        single "no sources" notice instead of result entries.
    """
    # Function-scope import so this block does not depend on a file-level change.
    import html

    matches_colbertv2 = get_matches_reranked(msg, k=20)
    matches_colbertv2 = filter_matches(matches_colbertv2, score_thr=0.55)
    _out = StringIO()

    # The stylesheet is the same for every entry, so emit it once up front
    # rather than repeating it for each match.
    print(style_str, file=_out)

    if not matches_colbertv2:
        print("<div>", file=_out)
        print("<div class=\"doc-title\">No sources relevant to this target were found.</div>", file=_out)
        print("</div>", file=_out)
        return _out.getvalue()

    for match in matches_colbertv2:
        metadata = match['metadata']
        raw_url = str(metadata['url'])

        # Metadata values come from the index, not from this program, so they
        # are escaped before being placed into markup; otherwise a stray '<',
        # '&' or quote could break the page or inject elements.
        # NOTE(review): escaping does not vet the URL scheme of the link.
        url = html.escape(raw_url, quote=True)
        title = html.escape(str(metadata['title']))
        text = html.escape(str(metadata['text']))
        domain = html.escape(urlparse(raw_url).netloc, quote=True)

        print("<div>", file=_out)
        favicon = f"<img src=\"https://www.google.com/s2/favicons?sz=128&domain={domain}\"/>"
        print(f"<div class=\"doc-title\">{favicon}  {title}<score>{match['colbertv2_score']:.2f}</score></div>", file=_out)
        print(f"<div class=\"doc-url\"><a href=\"{url}\" target=\"_blank\">{url}</a></div>", file=_out)
        print(f"<div class=\"doc-text\">{text}</div>", file=_out)
        print("</div>", file=_out)

    return _out.getvalue()
|
166 |
|
|
|
167 |
# UI layout: one text input for the query and one HTML panel for the results.
with gr.Blocks() as demo:
    msg = gr.Textbox(label='Target')
    # Earlier plain-text output, kept for reference:
    # results_box = gr.Textbox(label='Matches', lines=30, autoscroll=False)
    # output_chunks_reranked returns HTML markup, so the results go into an HTML component.
    results_box = gr.HTML(label='Matches')

    # Submitting the textbox calls output_chunks_reranked with its value and
    # writes the returned string into results_box.
    msg.submit(output_chunks_reranked, msg, results_box, queue=False)
|
|
|
|
|
|
|
173 |
|
174 |
|
175 |
if __name__ == "__main__":
|