Ritvik19 committed
Commit 5f9938a
1 Parent(s): f761d00

Upload 2 files

Files changed (2)
  1. app.py +9 -3
  2. process_documents.py +23 -23
app.py CHANGED
@@ -12,6 +12,8 @@ import base64
 
 st.set_page_config(layout="wide")
 os.environ["OPENAI_API_KEY"] = "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"
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_API_KEY"] = "ls__aca2f2f97d2f4b9caef0ef75c3c33f9d"
 
 get_references = lambda relevant_docs: " ".join(
     [f"[{ref}]" for ref in sorted([ref.metadata["chunk_id"] for ref in relevant_docs])]
@@ -46,14 +48,18 @@ def process_documents_wrapper(inputs):
     snippets = process_documents(inputs)
     st.session_state.retriever = create_retriever(snippets)
     st.session_state.source_doc_urls = inputs
-    st.session_state.index = [snip.metadata["header"] for snip in snippets]
+    st.session_state.index = [
+        [snip.metadata["chunk_id"], snip.metadata["header"]] for snip in snippets
+    ]
     response = f"Uploaded and processed documents {inputs}"
     st.session_state.messages.append((f"/upload {inputs}", response, ""))
     return response
 
 
 def index_documents_wrapper(inputs=None):
-    response = pd.Series(st.session_state.index, name="references").to_markdown()
+    response = pd.DataFrame(
+        st.session_state.index, columns=["id", "reference"]
+    ).to_markdown()
     st.session_state.messages.append(("/index", response, ""))
     return response
 
@@ -173,4 +179,4 @@ if __name__ == "__main__":
         default_function=query_llm_wrapper,
         all_commands=all_commands,
     )
-    boot(command_center)
+    boot(command_center)
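For context only (not part of the commit): a minimal sketch of what the reworked /index command renders once st.session_state.index stores [chunk_id, header] pairs. The ids and headers below are made-up placeholders, and pandas' to_markdown() relies on the optional tabulate package.

import pandas as pd

# Hypothetical index entries mirroring the new [chunk_id, header] shape;
# chunk ids follow the "{source_id}_{chunk_index}" scheme introduced in
# process_documents.py.
index = [
    ["0_0", "Introduction"],
    ["0_1", "Background"],
    ["1_0", "Example Domain"],
]

# Same rendering call as index_documents_wrapper; requires `tabulate`.
print(pd.DataFrame(index, columns=["id", "reference"]).to_markdown())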
process_documents.py CHANGED
@@ -10,17 +10,32 @@ deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
 
 def process_documents(urls):
     snippets = []
-    for url in urls:
+    for source_id, url in enumerate(urls):
         if url.endswith(".pdf"):
-            snippets.extend(process_pdf(url))
+            snippets.extend(process_pdf(url, source_id))
         else:
-            snippets.extend(process_web(url))
-    for e, snippet in enumerate(snippets):
-        snippet.metadata["chunk_id"] = e
+            snippets.extend(process_web(url, source_id))
     return snippets
 
 
-def process_pdf(url):
+def process_web(url, source_id):
+    data = WebBaseLoader(url).load()[0]
+    document_snippets = [
+        Document(
+            page_content=deep_strip(data.page_content),
+            metadata={
+                "header": data.metadata["title"],
+                "source_url": url,
+                "source_type": "web",
+                "chunk_id": f"{source_id}_0",
+                "source_id": source_id,
+            },
+        )
+    ]
+    return document_snippets
+
+
+def process_pdf(url, source_id):
     data = PDFMinerPDFasHTMLLoader(url).load()[0]
     content = BeautifulSoup(data.page_content, "html.parser").find_all("div")
     snippets = get_pdf_snippets(content)
@@ -36,7 +51,8 @@ def process_pdf(url):
                 "header": " ".join(snip[1]["header_text"].split()[:10]),
                 "source_url": url,
                 "source_type": "pdf",
-                "chunk_id": i,
+                "chunk_id": f"{source_id}_{i}",
+                "source_id": source_id,
             },
         )
         for i, snip in enumerate(semantic_snippets)
@@ -123,19 +139,3 @@ def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
        }
        semantic_snippets.append((current_content, metadata))
    return semantic_snippets
-
-
-def process_web(url):
-    data = WebBaseLoader(url).load()[0]
-    document_snippets = [
-        Document(
-            page_content=deep_strip(data.page_content),
-            metadata={
-                "header": data.metadata["title"],
-                "source_url": url,
-                "source_type": "web",
-                "chunk_id": 0,
-            },
-        )
-    ]
-    return document_snippets
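For illustration only, a small sketch (not from the repo) of the id scheme the rewritten process_documents now produces: each URL gets its own source_id, and chunk ids become f"{source_id}_{i}", so chunks from different documents no longer collide on a single global counter. The fake_loader helper is hypothetical and stands in for process_pdf / process_web; Document is assumed here to come from langchain_core.

from langchain_core.documents import Document

def fake_loader(url, source_id, n_chunks=2):
    # Hypothetical stand-in for process_pdf / process_web: emits n_chunks
    # snippets whose chunk ids are scoped to the source they came from.
    return [
        Document(
            page_content=f"chunk {i} of {url}",
            metadata={
                "chunk_id": f"{source_id}_{i}",
                "source_id": source_id,
                "source_url": url,
            },
        )
        for i in range(n_chunks)
    ]

snippets = []
for source_id, url in enumerate(["a.pdf", "https://example.com"]):
    snippets.extend(fake_loader(url, source_id))

print([s.metadata["chunk_id"] for s in snippets])
# -> ['0_0', '0_1', '1_0', '1_1']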