Ritvik19 committed on
Commit
9bb602c
1 Parent(s): c44c8ed

minor fixes

Browse files
Files changed (2) hide show
  1. app.py +10 -3
  2. process_documents.py +5 -0
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import os
3
  import pandas as pd
4
  from command_center import CommandCenter
5
- from process_documents import process_documents
6
  from embed_documents import create_retriever
7
  import json
8
  from langchain.callbacks import get_openai_callback
@@ -59,7 +59,12 @@ def process_documents_wrapper(inputs):
59
  st.session_state.retriever = create_retriever(snippets)
60
  st.session_state.source_doc_urls = inputs
61
  st.session_state.index = [
62
- [snip.metadata["chunk_id"], snip.metadata["header"]] for snip in snippets
 
 
 
 
 
63
  ]
64
  response = f"Uploaded and processed documents {inputs}"
65
  st.session_state.messages.append((f"/add-papers {inputs}", response, "identity"))
@@ -68,7 +73,9 @@ def process_documents_wrapper(inputs):
68
 
69
 
70
  def index_documents_wrapper(inputs=None):
71
- response = pd.DataFrame(st.session_state.index, columns=["id", "reference"])
 
 
72
  st.session_state.messages.append(("/library", response, "dataframe"))
73
  return (response, "dataframe")
74
 
 
2
  import os
3
  import pandas as pd
4
  from command_center import CommandCenter
5
+ from process_documents import process_documents, num_tokens
6
  from embed_documents import create_retriever
7
  import json
8
  from langchain.callbacks import get_openai_callback
 
59
  st.session_state.retriever = create_retriever(snippets)
60
  st.session_state.source_doc_urls = inputs
61
  st.session_state.index = [
62
+ [
63
+ snip.metadata["chunk_id"],
64
+ snip.metadata["header"],
65
+ num_tokens(snip.page_content),
66
+ ]
67
+ for snip in snippets
68
  ]
69
  response = f"Uploaded and processed documents {inputs}"
70
  st.session_state.messages.append((f"/add-papers {inputs}", response, "identity"))
 
73
 
74
 
75
  def index_documents_wrapper(inputs=None):
76
+ response = pd.DataFrame(
77
+ st.session_state.index, columns=["id", "reference", "tokens"]
78
+ )
79
  st.session_state.messages.append(("/library", response, "dataframe"))
80
  return (response, "dataframe")
81
 
process_documents.py CHANGED
@@ -4,6 +4,7 @@ from statistics import median
4
  from bs4 import BeautifulSoup
5
  from langchain.docstore.document import Document
6
  from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
 
7
 
8
  deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
9
 
@@ -153,3 +154,7 @@ def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
153
  }
154
  semantic_snippets.append((current_content, metadata))
155
  return semantic_snippets
 
 
 
 
 
4
  from bs4 import BeautifulSoup
5
  from langchain.docstore.document import Document
6
  from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
7
+ import tiktoken
8
 
9
  deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
10
 
 
154
  }
155
  semantic_snippets.append((current_content, metadata))
156
  return semantic_snippets
157
+
158
+
159
+ def num_tokens(string):
160
+ return len(tiktoken.get_encoding("cl100k_base").encode(string))