Spaces:
Runtime error
Runtime error
import os | |
import PyPDF2 | |
import random | |
import itertools | |
import streamlit as st | |
from io import StringIO | |
from langchain.vectorstores import FAISS | |
from langchain.chains import RetrievalQA | |
from langchain.chat_models import ChatOpenAI | |
from langchain.retrievers import SVMRetriever | |
from langchain.chains import QAGenerationChain | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler | |
from langchain.callbacks.base import CallbackManager | |
from langchain.embeddings import HuggingFaceEmbeddings | |
# Browser-tab title and icon for the Streamlit page.
st.set_page_config(
    page_title="PDF Analyzer",
    page_icon=":shark:",
)
def load_docs(files):
    """Read every uploaded file and return the combined text.

    Args:
        files: Iterable of uploaded file objects. Each must expose ``name``;
            PDF files are handed to ``PyPDF2.PdfReader`` and text files must
            provide ``getvalue()`` returning UTF-8 encoded bytes.

    Returns:
        The text of all supported files concatenated in upload order. A file
        with any other extension triggers a Streamlit warning and contributes
        nothing.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    st.info("`Reading doc ...`")
    parts = []
    for uploaded in files:
        # Lower-case the suffix so "REPORT.PDF" is treated like "report.pdf".
        suffix = os.path.splitext(uploaded.name)[1].lower()
        if suffix == ".pdf":
            reader = PyPDF2.PdfReader(uploaded)
            # `or ""` keeps the join safe should a page yield no text object.
            parts.extend(page.extract_text() or "" for page in reader.pages)
        elif suffix == ".txt":
            parts.append(uploaded.getvalue().decode("utf-8"))
        else:
            st.warning('Please provide txt or pdf.', icon="⚠️")
    return "".join(parts)
def create_retriever(_embeddings, splits, retriever_type):
    """Build a retriever over ``splits`` using the selected strategy.

    Args:
        _embeddings: Embedding model passed through to the retriever factory.
        splits: List of text chunks to index.
        retriever_type: ``"SIMILARITY SEARCH"`` for a FAISS vector store or
            ``"SUPPORT VECTOR MACHINES"`` for an ``SVMRetriever``.

    Returns:
        The retriever, or ``None`` after reporting the problem with
        ``st.error`` when the FAISS vector store cannot be built.

    Raises:
        ValueError: If ``retriever_type`` is not one of the two known values.
    """
    if retriever_type == "SIMILARITY SEARCH":
        try:
            vectorstore = FAISS.from_texts(splits, _embeddings)
        except (IndexError, ValueError) as e:
            st.error(f"Error creating vectorstore: {e}")
            return None
        # The number of documents to fetch is a search option, so it is
        # supplied through search_kwargs rather than as a bare keyword.
        return vectorstore.as_retriever(search_kwargs={"k": 5})
    if retriever_type == "SUPPORT VECTOR MACHINES":
        return SVMRetriever.from_texts(splits, _embeddings)
    # Fail with a clear message instead of reaching a return statement that
    # has no retriever bound.
    raise ValueError(f"Unknown retriever type: {retriever_type!r}")
def split_texts(text, chunk_size, overlap, split_method):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: Full document text.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.
        split_method: Accepted for interface compatibility only; the
            recursive character splitter is always used, whatever is passed.

    Returns:
        List of string chunks. When the splitter produces nothing, an error
        is shown and the Streamlit script is halted with ``st.stop()``.
    """
    st.info("`Splitting doc ...`")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=overlap)
    splits = text_splitter.split_text(text)
    if not splits:
        st.error("Failed to split document")
        st.stop()
    return splits
def generate_eval(text, N, chunk):
    """Generate sample question/answer pairs from random excerpts of ``text``.

    Args:
        text: Full document text to sample from.
        N: Number of excerpts to draw; the generation chain runs once per
            excerpt.
        chunk: Length, in characters, of each excerpt.

    Returns:
        Flat list obtained by chaining the results of every successful
        ``QAGenerationChain`` run. Excerpts whose generation fails are
        reported with a warning and skipped. An empty ``text`` yields ``[]``
        without calling the model.
    """
    st.info("`Generating sample questions ...`")
    if not text:
        return []
    # Clamp the upper bound: for documents shorter than `chunk` the excerpt
    # starts at 0 (and is the whole text) instead of randint raising
    # ValueError on an empty range.
    last_start = max(0, len(text) - chunk)
    starting_indices = [random.randint(0, last_start) for _ in range(N)]
    sub_sequences = [text[i:i + chunk] for i in starting_indices]
    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
    eval_set = []
    for i, b in enumerate(sub_sequences):
        try:
            qa = chain.run(b)
        except Exception:
            # Best effort: one failed excerpt must not abort the others.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            st.warning(f'Error generating question {i + 1}.', icon="⚠️")
        else:
            eval_set.append(qa)
            st.write("Creating Question:", i + 1)
    return list(itertools.chain.from_iterable(eval_set))
# ... | |
# Footer credit pinned to the bottom of the page.
_FOOTER_HTML = """
<div style="
position: fixed;
bottom: 0;
left: 30%;
right: 0;
width: 50%;
padding: 0px 0px;
text-align: center;
">
<p>Made by <a href='https://twitter.com/mehmet_ba7'>Mehmet Balioglu</a></p>
</div>
"""

# Custom CSS: hides the Streamlit menu/footer and styles the sidebar cards.
_CUSTOM_CSS = """
<style>
#MainMenu {visibility: hidden;
# }
footer {visibility: hidden;
}
.css-card {
border-radius: 0px;
padding: 30px 10px 10px 10px;
background-color: #f8f9fa;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
margin-bottom: 10px;
font-family: "IBM Plex Sans", sans-serif;
}
.card-tag {
border-radius: 0px;
padding: 1px 5px 1px 5px;
margin-bottom: 10px;
position: absolute;
left: 0px;
top: 0px;
font-size: 0.6rem;
font-family: "IBM Plex Sans", sans-serif;
color: white;
background-color: green;
}
.css-zt5igj {left:0;
}
span.css-10trblm {margin-left:0;
}
div.css-1kyxreq {margin-top: -40px;
}
</style>
"""

# Page heading with the "beta" tag.
_TITLE_HTML = """
<div style="display: flex; align-items: center; margin-left: 0;">
<h1 style="display: inline-block;">PDF Analyzer</h1>
<sup style="margin-left:5px;font-size:small; color: green;">beta</sup>
</div>
"""

# Sidebar card for one generated question/answer pair.
_CARD_TEMPLATE = """
<div class="css-card">
<span class="card-tag">Question {number}</span>
<p style="font-size: 12px;">{question}</p>
<p style="font-size: 12px;">{answer}</p>
</div>
"""


def _ensure_openai_api_key():
    """Export the OpenAI API key to ``OPENAI_API_KEY``; return False if absent.

    The key is read from ``st.session_state`` when already stored there,
    otherwise the user is prompted for it and a non-empty answer is stored.
    """
    if 'openai_api_key' not in st.session_state:
        entered_key = st.text_input(
            'Please enter your OpenAI API key or [get one here](https://platform.openai.com/account/api-keys)',
            value="",
            placeholder="Enter the OpenAI API key which begins with sk-",
            # Mask the secret instead of echoing it in clear text.
            type="password",
        )
        if not entered_key:
            return False
        st.session_state.openai_api_key = entered_key
    os.environ["OPENAI_API_KEY"] = st.session_state.openai_api_key
    return True


def _render_eval_cards(eval_set):
    """Show each generated question/answer pair as a card in the sidebar."""
    # Function-scope import: `html` is not part of the module's import block.
    from html import escape

    for number, qa_pair in enumerate(eval_set, start=1):
        # The text comes from the uploaded document via the LLM, so it is
        # escaped before being embedded in markup rendered as raw HTML.
        card = _CARD_TEMPLATE.format(
            number=number,
            question=escape(str(qa_pair['question'])),
            answer=escape(str(qa_pair['answer'])),
        )
        st.sidebar.markdown(card, unsafe_allow_html=True)


def main():
    """Render the PDF Analyzer page and answer questions about the uploads.

    Draws the static page elements and sidebar options, obtains the OpenAI
    API key, then, once files are uploaded, loads and splits their text,
    builds the retriever and the RetrievalQA chain, shows generated sample
    questions in the sidebar and answers the question typed by the user.
    Returns early while the API key or the upload is still missing, or when
    the retriever could not be created.
    """
    st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
    st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
    st.sidebar.image("img/logo1.png")
    st.write(_TITLE_HTML, unsafe_allow_html=True)

    st.sidebar.title("Menu")
    embedding_option = st.sidebar.radio(
        "Choose Embeddings", ["OpenAI Embeddings", "HuggingFace Embeddings(slower)"])
    retriever_type = st.sidebar.selectbox(
        "Choose Retriever", ["SIMILARITY SEARCH", "SUPPORT VECTOR MACHINES"])
    # RecursiveCharacterTextSplitter is the default and only text splitter.
    splitter_type = "RecursiveCharacterTextSplitter"

    if not _ensure_openai_api_key():
        return

    uploaded_files = st.file_uploader(
        "Upload a PDF or TXT Document",
        type=["pdf", "txt"],
        accept_multiple_files=True,
    )
    if not uploaded_files:
        return

    # A changed upload invalidates the previously generated sample questions.
    if ('last_uploaded_files' not in st.session_state
            or st.session_state.last_uploaded_files != uploaded_files):
        st.session_state.last_uploaded_files = uploaded_files
        if 'eval_set' in st.session_state:
            del st.session_state['eval_set']

    # Load and process the uploaded PDF or TXT files.
    loaded_text = load_docs(uploaded_files)
    st.write("Documents uploaded and processed.")

    # Split the document into chunks and report how many were produced.
    splits = split_texts(loaded_text, chunk_size=1000,
                         overlap=0, split_method=splitter_type)
    num_chunks = len(splits)
    st.write(f"Number of text chunks: {num_chunks}")

    # Embed using OpenAI embeddings or HuggingFace embeddings.
    if embedding_option == "OpenAI Embeddings":
        embeddings = OpenAIEmbeddings()
    else:
        # "HuggingFace Embeddings(slower)" is the only other radio choice;
        # the embedder is constructed with its default settings.
        embeddings = HuggingFaceEmbeddings()

    retriever = create_retriever(embeddings, splits, retriever_type)
    if retriever is None:
        # create_retriever returns None after reporting a vector-store
        # failure with st.error; there is nothing to build a chain on.
        return

    # Initialize the RetrievalQA chain with streaming output.
    callback_handler = StreamingStdOutCallbackHandler()
    callback_manager = CallbackManager([callback_handler])
    chat_openai = ChatOpenAI(
        streaming=True, callback_manager=callback_manager, verbose=True, temperature=0)
    qa = RetrievalQA.from_chain_type(
        llm=chat_openai, retriever=retriever, chain_type="stuff", verbose=True)

    # Generate the sample question-answer pairs once per upload and keep
    # them in the session state across reruns.
    if 'eval_set' not in st.session_state:
        num_eval_questions = 10  # Number of question-answer pairs to generate
        st.session_state.eval_set = generate_eval(
            loaded_text, num_eval_questions, 3000)

    # Display the question-answer pairs in the sidebar with smaller text.
    _render_eval_cards(st.session_state.eval_set)
    st.write("Ready to answer questions.")

    # Question and answering.
    user_question = st.text_input("Enter your question:")
    if user_question:
        answer = qa.run(user_question)
        st.write("Answer:", answer)
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()