Upload 2 files
- app.py +9 -3
- process_documents.py +23 -23
app.py
CHANGED
@@ -12,6 +12,8 @@ import base64
 
 st.set_page_config(layout="wide")
 os.environ["OPENAI_API_KEY"] = "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"
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_API_KEY"] = "ls__aca2f2f97d2f4b9caef0ef75c3c33f9d"
 
 get_references = lambda relevant_docs: " ".join(
     [f"[{ref}]" for ref in sorted([ref.metadata["chunk_id"] for ref in relevant_docs])]
@@ -46,14 +48,18 @@ def process_documents_wrapper(inputs):
     snippets = process_documents(inputs)
     st.session_state.retriever = create_retriever(snippets)
     st.session_state.source_doc_urls = inputs
-    st.session_state.index = [
+    st.session_state.index = [
+        [snip.metadata["chunk_id"], snip.metadata["header"]] for snip in snippets
+    ]
     response = f"Uploaded and processed documents {inputs}"
     st.session_state.messages.append((f"/upload {inputs}", response, ""))
     return response
 
 
 def index_documents_wrapper(inputs=None):
-    response = pd.
+    response = pd.DataFrame(
+        st.session_state.index, columns=["id", "reference"]
+    ).to_markdown()
     st.session_state.messages.append(("/index", response, ""))
     return response
 
@@ -173,4 +179,4 @@ if __name__ == "__main__":
         default_function=query_llm_wrapper,
         all_commands=all_commands,
     )
-    boot(command_center)
+    boot(command_center)

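For context, a minimal sketch of what the reworked /index command now renders, using hypothetical chunk ids and headers in place of real uploaded documents (note that pandas.DataFrame.to_markdown() requires the optional tabulate dependency):

import pandas as pd

# Hypothetical entries in the [chunk_id, header] shape that
# process_documents_wrapper now stores in st.session_state.index.
index = [
    ["0_0", "1 Introduction"],
    ["0_1", "2 Related Work"],
    ["1_0", "Example Domain"],
]

# Same rendering as index_documents_wrapper: an id/reference table
# serialized to Markdown for display in the chat history.
print(pd.DataFrame(index, columns=["id", "reference"]).to_markdown())

The ids in the first column follow the source-scoped chunk_id scheme introduced in process_documents.py below.
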
process_documents.py
CHANGED
@@ -10,17 +10,32 @@ deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
 
 def process_documents(urls):
     snippets = []
-    for url in urls:
+    for source_id, url in enumerate(urls):
         if url.endswith(".pdf"):
-            snippets.extend(process_pdf(url))
+            snippets.extend(process_pdf(url, source_id))
         else:
-            snippets.extend(process_web(url))
-    for e, snippet in enumerate(snippets):
-        snippet.metadata["chunk_id"] = e
+            snippets.extend(process_web(url, source_id))
     return snippets
 
 
-def process_pdf(url):
+def process_web(url, source_id):
+    data = WebBaseLoader(url).load()[0]
+    document_snippets = [
+        Document(
+            page_content=deep_strip(data.page_content),
+            metadata={
+                "header": data.metadata["title"],
+                "source_url": url,
+                "source_type": "web",
+                "chunk_id": f"{source_id}_0",
+                "source_id": source_id,
+            },
+        )
+    ]
+    return document_snippets
+
+
+def process_pdf(url, source_id):
     data = PDFMinerPDFasHTMLLoader(url).load()[0]
     content = BeautifulSoup(data.page_content, "html.parser").find_all("div")
     snippets = get_pdf_snippets(content)
@@ -36,7 +51,8 @@ def process_pdf(url):
                 "header": " ".join(snip[1]["header_text"].split()[:10]),
                 "source_url": url,
                 "source_type": "pdf",
-                "chunk_id": i,
+                "chunk_id": f"{source_id}_{i}",
+                "source_id": source_id,
             },
         )
         for i, snip in enumerate(semantic_snippets)
@@ -123,19 +139,3 @@ def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
         }
         semantic_snippets.append((current_content, metadata))
     return semantic_snippets
-
-
-def process_web(url):
-    data = WebBaseLoader(url).load()[0]
-    document_snippets = [
-        Document(
-            page_content=deep_strip(data.page_content),
-            metadata={
-                "header": data.metadata["title"],
-                "source_url": url,
-                "source_type": "web",
-                "chunk_id": 0,
-            },
-        )
-    ]
-    return document_snippets
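
A short sketch of that id scheme with hypothetical URLs: every chunk id is now f"{source_id}_{i}", where source_id is the URL's position in the upload list and i is the chunk's index within that document, so ids stay unique when several documents are loaded (process_web still produces a single chunk per page, with id {source_id}_0):

# Hypothetical illustration of the chunk-id scheme introduced above;
# the URLs are made up and nothing is actually downloaded here.
urls = ["https://example.com/paper.pdf", "https://example.com/post"]

for source_id, url in enumerate(urls):
    # Every chunk from this source gets an id of the form f"{source_id}_{i}".
    first_chunk_id = f"{source_id}_0"
    print(source_id, url, "first chunk id:", first_chunk_id)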