minor fixes
- app.py +10 -3
- process_documents.py +5 -0
app.py CHANGED

@@ -2,7 +2,7 @@ import streamlit as st
 import os
 import pandas as pd
 from command_center import CommandCenter
-from process_documents import process_documents
+from process_documents import process_documents, num_tokens
 from embed_documents import create_retriever
 import json
 from langchain.callbacks import get_openai_callback
@@ -59,7 +59,12 @@ def process_documents_wrapper(inputs):
     st.session_state.retriever = create_retriever(snippets)
     st.session_state.source_doc_urls = inputs
     st.session_state.index = [
-        [
+        [
+            snip.metadata["chunk_id"],
+            snip.metadata["header"],
+            num_tokens(snip.page_content),
+        ]
+        for snip in snippets
     ]
     response = f"Uploaded and processed documents {inputs}"
     st.session_state.messages.append((f"/add-papers {inputs}", response, "identity"))
@@ -68,7 +73,9 @@ def process_documents_wrapper(inputs):
 
 
 def index_documents_wrapper(inputs=None):
-    response = pd.DataFrame(
+    response = pd.DataFrame(
+        st.session_state.index, columns=["id", "reference", "tokens"]
+    )
     st.session_state.messages.append(("/library", response, "dataframe"))
     return (response, "dataframe")
 
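Taken together, the two body hunks build a lightweight chunk index: process_documents_wrapper now stores one [chunk_id, header, token_count] row per snippet in st.session_state.index, and index_documents_wrapper (the /library command) renders those rows as a pandas DataFrame. Below is a minimal sketch of that flow outside Streamlit, assuming Document-style snippets as produced by process_documents; the sample contents and headers are made up, and only the metadata keys and column names come from the diff:

import pandas as pd
from langchain.docstore.document import Document
from process_documents import num_tokens

# hypothetical stand-ins for the snippets returned by process_documents()
snippets = [
    Document(page_content="We propose a new attention mechanism ...",
             metadata={"chunk_id": 0, "header": "1 Introduction"}),
    Document(page_content="Multi-head attention projects the queries ...",
             metadata={"chunk_id": 1, "header": "3.2 Multi-Head Attention"}),
]

# one [id, reference, tokens] row per snippet, as in process_documents_wrapper
index = [
    [snip.metadata["chunk_id"], snip.metadata["header"], num_tokens(snip.page_content)]
    for snip in snippets
]

# /library renders the same rows as a table
print(pd.DataFrame(index, columns=["id", "reference", "tokens"]))

Computing the counts once at upload time presumably lets later commands budget prompt context without re-encoding every snippet.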
process_documents.py CHANGED

@@ -4,6 +4,7 @@ from statistics import median
 from bs4 import BeautifulSoup
 from langchain.docstore.document import Document
 from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
+import tiktoken
 
 deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
 
@@ -153,3 +154,7 @@ def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
         }
         semantic_snippets.append((current_content, metadata))
     return semantic_snippets
+
+
+def num_tokens(string):
+    return len(tiktoken.get_encoding("cl100k_base").encode(string))
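The new num_tokens helper counts tokens with tiktoken's cl100k_base encoding, the tokenizer used by OpenAI's gpt-3.5-turbo/gpt-4 chat models and text-embedding-ada-002, so the token column shown by /library lines up with what the OpenAI API counts. A quick usage check (the sample string is the one from tiktoken's README):

import tiktoken

def num_tokens(string):
    # cl100k_base is the BPE vocabulary behind gpt-3.5-turbo / gpt-4
    return len(tiktoken.get_encoding("cl100k_base").encode(string))

print(num_tokens("tiktoken is great!"))  # 6

Calling tiktoken.get_encoding on every invocation looks wasteful, but tiktoken caches constructed encodings in a module-level registry, so only the first call pays the load cost.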