# Hugging Face Spaces page banner ("Spaces: Sleeping") captured with the source;
# kept as a comment so the file remains valid Python.
import streamlit as st
from PyPDF2 import PdfReader
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import HuggingFaceInferenceAPI
from llama_index.schema import Document
class DocumentLoader:
    """Turn an uploaded PDF file into a list of llama_index ``Document`` objects."""

    @staticmethod
    def read_pdf(uploaded_file):
        """Extract and concatenate the text of every page of *uploaded_file*.

        Args:
            uploaded_file: A binary file-like object readable by PyPDF2's
                ``PdfReader`` (e.g. a Streamlit ``UploadedFile``).

        Returns:
            The text of all pages joined into a single string.
        """
        pdf_reader = PdfReader(uploaded_file)
        # extract_text() may return None for image-only pages; substitute "" so
        # the join never fails. "".join avoids the quadratic += string build.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def load_documents(uploaded_pdf):
        """Read *uploaded_pdf* and wrap its full text in a single ``Document``."""
        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
        return [Document(text=file_contents)]
class IndexCreator:
    """Builds a persisted vector index over documents and exposes a query engine."""

    @staticmethod
    def create_index(documents, hf_token):
        """Index *documents* and return a query engine over the result.

        Args:
            documents: Iterable of llama_index ``Document`` objects.
            hf_token: Hugging Face API token used by the inference-API LLM.

        Returns:
            A llama_index query engine backed by the newly built index.
        """
        # Remote zephyr-7b-alpha via the free HF Inference API (no local weights).
        llm = HuggingFaceInferenceAPI(
            model_name="HuggingFaceH4/zephyr-7b-alpha", token=hf_token
        )
        # Local open-source embedding model used to vectorize the chunks.
        embed_model_uae = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")
        service_context = ServiceContext.from_defaults(
            llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embed_model_uae
        )
        index = VectorStoreIndex.from_documents(
            documents, service_context=service_context, show_progress=True
        )
        # Persist to the default ./storage directory so the index can be reloaded.
        index.storage_context.persist()
        return index.as_query_engine()
class PDFQueryApp:
    """Streamlit UI: upload a PDF, build a vector index over it, and query it."""

    def __init__(self):
        """Render the static page chrome and collect the token and PDF inputs."""
        st.title("Private LLM @Purbayan_Majumder")
        st.write("Base Model : **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggingFace)**")
        st.write("Embedding Model : **WhereIsAI/UAE-Large-V1 (open-source from HuggingFace)**")
        st.write("Ask anything from the data that you upload")
        st.write("Note !! As it's running on a CPU it takes 5 to 8 mins for each response")
        self.hf_token = st.text_input("Enter your Hugging Face token [Free]:")
        self.uploaded_pdf = st.file_uploader("Upload your data[PDF for now]", type=['pdf'])
        self.query_engine = None  # populated by load_and_create_index()

    def load_and_create_index(self):
        """Read the uploaded PDF and build the query engine; warn if no file yet."""
        # Guard clause instead of if/else nesting.
        if not self.uploaded_pdf:
            st.warning("You have to upload a PDF file first.")
            return
        documents = DocumentLoader.load_documents(self.uploaded_pdf)
        # Report success only AFTER the document has actually been read
        # (the original showed the banner before doing any work).
        st.success("Dataset has been loaded into the model successfully")
        self.query_engine = IndexCreator.create_index(documents, self.hf_token)
        st.success("Vector embeddings have been successfully created and initiated")

    def run_query(self, user_query):
        """Send *user_query* to the query engine and render the response."""
        if self.query_engine and user_query:
            with st.spinner('Fetching the response from the model Please wait !!!!...'):
                response = self.query_engine.query(user_query)
                st.markdown(f"**Response:** {response}")
        else:
            st.warning("Please load documents and create vector embeddings before querying.")
def main():
    """Script entry point: build the app, index any uploaded PDF, answer queries."""
    app = PDFQueryApp()
    app.load_and_create_index()
    # Streamlit re-runs the script on each interaction, so the query box is
    # rendered every run and the answer appears once an index exists.
    user_query = st.text_input("Enter your query from the dataset:")
    app.run_query(user_query)


if __name__ == "__main__":
    main()