Adipta committed
Commit 23f71e4
1 Parent(s): 3891d10

Upload 5 files

Files changed (5)
  1. app.py +21 -0
  2. config.py +10 -0
  3. interface.py +32 -0
  4. pdfchatbot.py +185 -0
  5. requirements.txt +15 -0
app.py ADDED
@@ -0,0 +1,21 @@
+ from interface import create_demo
+ from pdfchatbot import PDFChatBot
+
+ # Create Gradio interface
+ demo, chat_history, show_img, txt, submit_button, uploaded_pdf = create_demo()
+
+ # Create PDFChatBot instance
+ pdf_chatbot = PDFChatBot()
+
+ # Set up event handlers
+ with demo:
+     # Event handler for uploading a PDF
+     uploaded_pdf.upload(pdf_chatbot.render_file, inputs=[uploaded_pdf], outputs=[show_img])
+
+     # Event handler for submitting text and generating a response
+     submit_button.click(pdf_chatbot.add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
+         success(pdf_chatbot.generate_response, inputs=[chat_history, txt, uploaded_pdf], outputs=[chat_history, txt]).\
+         success(pdf_chatbot.render_file, inputs=[uploaded_pdf], outputs=[show_img])
+
+ if __name__ == "__main__":
+     demo.launch()
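
One gap worth noting: pressing Enter in the textbox does nothing, since only the Send button is wired up. A minimal sketch of an Enter-key binding, assuming it is placed inside the `with demo:` block alongside the handlers above:

    # Hypothetical addition: let Enter trigger the same chain as the Send button
    txt.submit(pdf_chatbot.add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
        success(pdf_chatbot.generate_response, inputs=[chat_history, txt, uploaded_pdf], outputs=[chat_history, txt]).\
        success(pdf_chatbot.render_file, inputs=[uploaded_pdf], outputs=[show_img])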
config.py ADDED
@@ -0,0 +1,10 @@
+ from pydantic_settings import BaseSettings
+
+
+ class ModelConfig(BaseSettings):
+     MODEL_EMBEDDINGS: str = "sentence-transformers/all-MiniLM-L6-v2"
+     AUTO_TOKENIZER: str = "meta-llama/Llama-2-7b-chat-hf"
+     MODEL_LLM: str = "meta-llama/Llama-2-7b-chat-hf"
+
+
+ MODEL_CONFIG = ModelConfig()
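
Because ModelConfig extends pydantic-settings' BaseSettings, each field above can be overridden through an environment variable of the same name, so checkpoints can be swapped without editing code. A quick sketch (the 13B checkpoint name is only an illustrative value):

    import os
    os.environ["MODEL_LLM"] = "meta-llama/Llama-2-13b-chat-hf"  # hypothetical override

    from config import MODEL_CONFIG  # instantiated at import, after the env var is set
    print(MODEL_CONFIG.MODEL_LLM)    # -> meta-llama/Llama-2-13b-chat-hf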
interface.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+
+ # Gradio application setup
+ def create_demo():
+     with gr.Blocks(title="RAG Chatbot Q&A",
+                    theme="soft"
+                    ) as demo:
+         with gr.Column():
+             with gr.Row():
+                 chat_history = gr.Chatbot(value=[], elem_id='chatbot', height=680)
+                 show_img = gr.Image(label='Overview', height=680)
+
+             with gr.Row():
+                 with gr.Column(scale=6):
+                     text_input = gr.Textbox(
+                         show_label=False,
+                         placeholder="Type here to ask your PDF",
+                         container=False)
+
+                 with gr.Column(scale=2):
+                     submit_button = gr.Button('Send')
+
+                 with gr.Column(scale=2):
+                     uploaded_pdf = gr.UploadButton("📁 Upload PDF", file_types=[".pdf"])
+
+
+     return demo, chat_history, show_img, text_input, submit_button, uploaded_pdf
+
+ if __name__ == '__main__':
+     demo, chatbot, show_img, text_input, submit_button, uploaded_pdf = create_demo()
+     demo.queue()
+     demo.launch()
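
create_demo only builds the layout; all real handlers are wired up in app.py. To preview the UI without loading any model, stub handlers can be attached the same way — a layout-only sketch (the lambda reply is purely illustrative):

    from interface import create_demo

    demo, chat_history, show_img, txt, submit_button, uploaded_pdf = create_demo()

    with demo:
        # Echo the question with a placeholder answer instead of calling an LLM
        submit_button.click(
            lambda history, text: history + [[text, "(model not loaded)"]],
            inputs=[chat_history, txt],
            outputs=[chat_history],
        )

    demo.launch()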
pdfchatbot.py ADDED
@@ -0,0 +1,185 @@
+ import yaml
+ import fitz
+ import torch
+ import gradio as gr
+ import weaviate
+ import os
+ from PIL import Image
+ from config import MODEL_CONFIG
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain_weaviate.vectorstores import WeaviateVectorStore
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.llms import HuggingFacePipeline
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.prompts import PromptTemplate
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ class PDFChatBot:
+     def __init__(self):
+         """
+         Initialize the PDFChatBot instance.
+         """
+         self.processed = False
+         self.page = 0
+         self.chat_history = []
+         # Initialize other attributes to None
+         self.prompt = None
+         self.documents = None
+         self.embeddings = None
+         self.vectordb = None
+         self.tokenizer = None
+         self.model = None
+         self.pipeline = None
+         self.chain = None
+
+     def add_text(self, history, text):
+         """
+         Add user-entered text to the chat history.
+
+         Parameters:
+             history (list): List of [user, bot] chat pairs.
+             text (str): User-entered text.
+
+         Returns:
+             list: Updated chat history.
+         """
+         if not text:
+             raise gr.Error('Enter text')
+         # Append a mutable [user, bot] pair so the answer can be filled in later
+         history.append([text, ''])
+         return history
+
+     def create_prompt_template(self):
+         """
+         Create the condense-question prompt template for the chatbot.
+         """
+         template = (
+             "The assistant should provide detailed explanations. "
+             "Combine the chat history and follow up question into "
+             "a standalone question.\n"
+             "Chat History: {chat_history}\n"
+             "Follow up question: {question}"
+         )
+         self.prompt = PromptTemplate.from_template(template)
+
+     def load_embeddings(self):
+         """
+         Load the embedding model named in the config file from Hugging Face.
+         """
+         self.embeddings = HuggingFaceEmbeddings(model_name=MODEL_CONFIG.MODEL_EMBEDDINGS)
+
+     def load_vectordb(self):
+         """
+         Load the vector database from the documents and embeddings.
+         """
+         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+         docs = text_splitter.split_documents(self.documents)
+
+         # weaviate-client v4 (pinned in requirements.txt) replaces the old
+         # weaviate.Client with connection helpers such as connect_to_weaviate_cloud
+         auth_config = weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY"))
+
+         weaviate_client = weaviate.connect_to_weaviate_cloud(
+             cluster_url=os.getenv("WEAVIATE_URL"),
+             auth_credentials=auth_config,
+         )
+
+         self.vectordb = WeaviateVectorStore.from_documents(docs, self.embeddings, client=weaviate_client)
+
+     def load_tokenizer(self):
+         """
+         Load the tokenizer named in the config file from Hugging Face.
+         """
+         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG.AUTO_TOKENIZER)
+
+     def load_model(self):
+         """
+         Load the causal language model named in the config file from Hugging Face.
+         """
+         self.model = AutoModelForCausalLM.from_pretrained(
+             MODEL_CONFIG.MODEL_LLM,
+             device_map='auto',
+             torch_dtype=torch.float32,
+             token=True,
+             load_in_8bit=False
+         )
+
+     def create_pipeline(self):
+         """
+         Create a pipeline for text generation using the loaded model and tokenizer.
+         """
+         pipe = pipeline(
+             model=self.model,
+             task='text-generation',
+             tokenizer=self.tokenizer,
+             max_new_tokens=200
+         )
+         self.pipeline = HuggingFacePipeline(pipeline=pipe)
+
+     def create_chain(self):
+         """
+         Create a Conversational Retrieval Chain.
+         """
+         self.chain = ConversationalRetrievalChain.from_llm(
+             self.pipeline,
+             chain_type="stuff",
+             retriever=self.vectordb.as_retriever(search_kwargs={"k": 1}),
+             condense_question_prompt=self.prompt,
+             return_source_documents=True
+         )
+
+     def process_file(self, file):
+         """
+         Process the uploaded PDF file and initialize the necessary components: tokenizer, vector database and LLM.
+
+         Parameters:
+             file (FileStorage): The uploaded PDF file.
+         """
+         self.create_prompt_template()
+         self.documents = PyPDFLoader(file.name).load()
+         self.load_embeddings()
+         self.load_vectordb()
+         self.load_tokenizer()
+         self.load_model()
+         self.create_pipeline()
+         self.create_chain()
+
+     def generate_response(self, history, query, file):
+         """
+         Generate a response based on the user query and chat history.
+
+         Parameters:
+             history (list): List of [user, bot] chat pairs.
+             query (str): User's query.
+             file (FileStorage): The uploaded PDF file.
+
+         Returns:
+             tuple: Updated chat history and a space string to reset the input textbox.
+         """
+         if not query:
+             raise gr.Error(message='Submit a question')
+         if not file:
+             raise gr.Error(message='Upload a PDF')
+         if not self.processed:
+             self.process_file(file)
+             self.processed = True
+
+         result = self.chain({"question": query, 'chat_history': self.chat_history}, return_only_outputs=True)
+         self.chat_history.append((query, result["answer"]))
+         # Remember which page the top source chunk came from so render_file can show it
+         self.page = result['source_documents'][0].metadata['page']
+
+         for char in result['answer']:
+             history[-1][-1] += char
+         return history, " "
+
+     def render_file(self, file):
+         """
+         Render the current page of the PDF file as an image.
+
+         Parameters:
+             file (FileStorage): The PDF file.
+
+         Returns:
+             PIL.Image.Image: The rendered page as an image.
+         """
+         doc = fitz.open(file.name)
+         page = doc[self.page]
+         # Render at 300 DPI (PDF native resolution is 72 DPI)
+         pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
+         image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
+         return image
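
The class can also be driven without the Gradio front end, which is handy for smoke-testing the RAG pipeline. A sketch under stated assumptions: a local sample.pdf exists, WEAVIATE_URL and WEAVIATE_API_KEY are set, and the gated Llama-2 weights are accessible (e.g. after huggingface-cli login):

    from types import SimpleNamespace
    from pdfchatbot import PDFChatBot

    bot = PDFChatBot()
    pdf = SimpleNamespace(name="sample.pdf")  # stand-in for the file object Gradio passes

    query = "What is this document about?"
    history = bot.add_text([], query)
    history, _ = bot.generate_response(history, query, pdf)
    print(history[-1][1])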
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ PyMuPDF==1.23.17
+ gradio==4.11.0
+ langchain==0.0.321
+ langchain-weaviate==0.0.2
+ Pillow==10.1.0
+ torch==2.1.1
+ transformers==4.35.2
+ PyYAML==6.0.1
+ weaviate-client==4.6.3
+ pypdf==4.0.0
+ Jinja2==3.1.3
+ accelerate==0.26.1
+ sentence-transformers==2.2.2
+ pydantic-settings==2.2.1
+ tiktoken==0.7.0