Commit eb42e7a • arslan-ahmed committed
1 Parent(s): f14681d

Optimization
Files changed:
- .gitignore +2 -1
- README.md +2 -2
- app.py +15 -17
- ttyd_consts.py +20 -3
- ttyd_functions.py +23 -6
.gitignore CHANGED
@@ -1,4 +1,5 @@
 .env
 __pycache__
 documents
-vecstore
+vecstore
+gDriveDocs
README.md CHANGED
@@ -34,8 +34,8 @@ docker run --rm -d -p 7860:7860 --env-file ./.env arslan2k12/ttyd_arslanbot
 
 Contents of `.env` file:
 ```
-TTYD_MODE=
-#
+TTYD_MODE=personalBot_John
+#replace John with your name - use only alphabets, no special characters
 
 GDRIVE_FOLDER_URL=https://drive.google.com/drive/folders/1ce1n1kleS1FOotdcu5joXeSRu_xnHjDt
 # replace with your Google Drive folder URL that has all your knowledge base files (.pdf, .docs, .txt) - make sure this folder is publicly accessible (everyone with the link)
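Put together, a filled-in `.env` for a personal bot would look something like the sketch below. `TTYD_MODE` and `GDRIVE_FOLDER_URL` follow the README excerpt above; the `OPENAI_API_KEY` line is an assumption based on `getPersonalBotApiKey()` in ttyd_functions.py, which alternatively accepts `WX_API_KEY` plus `WX_PROJECT_ID`, or `BAM_API_KEY`. All values are placeholders, not from this commit.

```
# placeholder values only
TTYD_MODE=personalBot_John
GDRIVE_FOLDER_URL=https://drive.google.com/drive/folders/<your-folder-id>
OPENAI_API_KEY=<your-openai-api-key>
```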
app.py CHANGED
@@ -1,4 +1,3 @@
-import gdown
 from dotenv import load_dotenv
 import datetime
 import openai
@@ -38,16 +37,16 @@ from ttyd_consts import *
 ###############################################################################################
 
 load_dotenv()
+TTYD_MODE = os.getenv("TTYD_MODE",'')
+
 
 # select the mode when starting container - modes options are in ttyd_consts.py
-if
+if TTYD_MODE.split('_')[0]=='personalBot':
     mode = mode_arslan
-
-
-
-
-    mode.title=''
-    mode.welcomeMsg=''
+    if TTYD_MODE!='personalBot_arslan':
+        user = TTYD_MODE.split('_')[1]
+        mode.title='## Talk to '+user
+        mode.welcomeMsg= welcomeMsgUser(user)
 
 elif os.getenv("TTYD_MODE",'')=='nustian':
     mode = mode_nustian
@@ -57,7 +56,7 @@ else:
 
 if mode.type!='userInputDocs':
     # local vector store as opposed to gradio state vector store, if we the user is not uploading the docs
-    vsDict_hard = localData_vecStore(getPersonalBotApiKey(), inputDir=mode.inputDir, file_list=mode.file_list, url_list=mode.url_list)
+    vsDict_hard = localData_vecStore(getPersonalBotApiKey(), inputDir=mode.inputDir, file_list=mode.file_list, url_list=mode.url_list, gGrUrl=mode.gDriveFolder)
 
 ###############################################################################################
 
@@ -133,9 +132,9 @@ def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.P
     return vsDict_st, src_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')
 
 # initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
-def initializeChatbot(temp, k,
+def initializeChatbot(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
     progress(0.1, waitText_initialize)
-    chainTuple = updateQaChain(temp, k,
+    chainTuple = updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st)
     qa_chain_st = chainTuple[0]
     progress(0.5, waitText_initialize)
     #generate welcome message
@@ -147,7 +146,7 @@ def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progres
     print('Chatbot initialized at ', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
 
     return qa_chain_st, chainTuple[1], btn.update(interactive=True), initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)\
-            , status_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('
+            , status_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('', welMsg)])
 
 # just update the QA Chain, no updates to any UI
 def updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st):
@@ -156,15 +155,15 @@ def updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st):
 
     if api_key_st['service']=='openai':
         if not 'openai' in modelNameDD:
-            modelNameDD =
+            modelNameDD = changeModel(modelNameDD, OaiDefaultModel)
         llm = getOaiLlm(temp, modelNameDD, api_key_st)
     elif api_key_st['service']=='watsonx':
         if not 'watsonx' in modelNameDD:
-            modelNameDD =
+            modelNameDD = changeModel(modelNameDD, WxDefaultModel)
         llm = getWxLlm(temp, modelNameDD, api_key_st)
     elif api_key_st['service']=='bam':
         if not 'bam' in modelNameDD:
-            modelNameDD =
+            modelNameDD = changeModel(modelNameDD, BamDefaultModel)
         llm = getBamLlm(temp, modelNameDD, api_key_st)
     else:
         raise Exception('Error: Invalid or None Credentials')
@@ -261,8 +260,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
     temp_sld = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature", info='Sampling temperature to use when calling LLM. Defaults to 0.7')
     k_sld = gr.Slider(minimum=1, maximum=10, step=1, value=mode.k, label="K", info='Number of relavant documents to return from Vector Store. Defaults to 4')
     model_dd = gr.Dropdown(label='Model Name'\
-            , choices=model_dd_choices\
-            , value=model_dd_choices[0], allow_custom_value=True\
+            , choices=model_dd_choices, allow_custom_value=True\
             , info=model_dd_info)
     stdlQs_rb = gr.Radio(label='Standalone Question', info=stdlQs_rb_info\
             , type='index', value=stdlQs_rb_choices[1]\
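The mode-selection hunk above keys everything off a `personalBot_<Name>` convention in `TTYD_MODE`. A minimal standalone sketch of that parsing follows; `resolve_mode` is a hypothetical wrapper for illustration (the real code mutates a module-level `mode` object instead of returning values), and `welcomeMsgUser` is copied from the ttyd_consts.py change below.

```python
# Sketch of the personalBot mode parsing added in this commit.
# resolve_mode is hypothetical; the app assigns to mode.title/mode.welcomeMsg.

def welcomeMsgUser(user):
    # same text as the helper added in ttyd_consts.py
    return f"""Hi, Welcome to personal chatbot of {user}. I am trained on the documents {user} has provided me. Ask me anything about {user} and I'll try my best to answer."""

def resolve_mode(ttyd_mode):
    """Return (title, welcome_msg) for a personalBot_* mode string."""
    if ttyd_mode.split('_')[0] == 'personalBot' and ttyd_mode != 'personalBot_arslan':
        user = ttyd_mode.split('_')[1]   # e.g. 'John' from 'personalBot_John'
        return '## Talk to ' + user, welcomeMsgUser(user)
    raise ValueError('not a generic personalBot mode')

title, welcome = resolve_mode('personalBot_John')
print(title)    # ## Talk to John
```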
ttyd_consts.py CHANGED
@@ -1,4 +1,7 @@
 from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
+import os
+from dotenv import load_dotenv
+load_dotenv()
 
 exp_query = 'Generate top 5 questions that I can ask about this data. Questions should be very precise and short, ideally less than 10 words.'
 
@@ -54,10 +57,16 @@ bam_models = sorted(['bigscience/bloom',
     'bigcode/starcoder',
     'google/ul2'])
 
-model_dd_info = 'You can also input any OpenAI model name or BAM model ID.'
+model_dd_info = 'Make sure your credentials are submitted before changing the model. You can also input any OpenAI model name or Watsonx/BAM model ID.'
 
 model_dd_choices = ['gpt-3.5-turbo (openai)', 'gpt-3.5-turbo-16k (openai)', 'gpt-4 (openai)', 'text-davinci-003 (Legacy - openai)', 'text-curie-001 (Legacy - openai)', 'babbage-002 (openai)'] + [model.value+' (watsonx)' for model in ModelTypes] + [model + ' (bam)' for model in bam_models]
 
+
+OaiDefaultModel = 'gpt-3.5-turbo (openai)'
+WxDefaultModel = 'meta-llama/llama-2-70b-chat (watsonx)'
+BamDefaultModel = 'meta-llama/llama-2-70b-chat (bam)'
+
+
 url_tb_info = 'Upto 100 domain webpages will be crawled for each URL. You can also enter online PDF files.'
 
 url_tb_ph = 'https://example.com, https://another.com, https://anyremotedocument.pdf'
@@ -102,14 +111,22 @@ welcomeMsgArslan = """Summary: The document provides a comprehensive overview of
 
 welcomeMsgDefault = """Hello and welcome! I'm your personal data assistant. Ask me anything about your data and I'll try my best to answer."""
 
+
+def welcomeMsgUser(user):
+    return f"""Hi, Welcome to personal chatbot of {user}. I am trained on the documents {user} has provided me. Ask me anything about {user} and I'll try my best to answer."""
+
+
+gDrFolder=(os.getenv("GDRIVE_FOLDER_URL",'')).replace('?usp=sharing','')
+
 class TtydMode():
-    def __init__(self, name='', title='', type='', dir=None, files=[], urls=[], vis=False, welMsg='', def_k=4):
+    def __init__(self, name='', title='', type='', dir=None, files=[], urls=[], vis=False, welMsg='', def_k=4, gDrFolder=''):
         self.name = name
         self.title = title # markdown title for the top display
         self.type = type # userInputDocs, fixedDocs, personalBot
         self.inputDir=dir
         self.file_list=files
         self.url_list=urls
+        self.gDriveFolder=gDrFolder
         self.uiAddDataVis = vis # load data from user - this will be true for type = userInputDocs
         self.welcomeMsg = welMsg #welcome msg constant - if not provided LLM will generate it
         self.k = def_k # default k docs to retrieve
@@ -118,4 +135,4 @@ class TtydMode():
 
 mode_general = TtydMode(name='general', title=md_title_general, type='userInputDocs', vis=True)
 mode_nustian = TtydMode(name='nustian', title=md_title_nustian, type='fixedDocs', urls=['https://nustian.ca'])
-mode_arslan = TtydMode(name='arslan', title=md_title_arslan, type='personalBot',
+mode_arslan = TtydMode(name='arslan', title=md_title_arslan, type='personalBot', welMsg=welcomeMsgArslan, def_k=8, gDrFolder=gDrFolder)
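The `*DefaultModel` constants above pair with the `changeModel()` helper added in ttyd_functions.py: when the selected dropdown entry doesn't carry the tag of the service the user actually has credentials for, `updateQaChain` falls back to that service's default. A standalone sketch of the fallback, with `gr.Warning`/`time.sleep` swapped for a plain `print` so it runs without Gradio (that substitution is mine, not the commit's):

```python
# Fallback-to-default-model logic from this commit, UI calls stubbed out.

OaiDefaultModel = 'gpt-3.5-turbo (openai)'
WxDefaultModel = 'meta-llama/llama-2-70b-chat (watsonx)'
BamDefaultModel = 'meta-llama/llama-2-70b-chat (bam)'

def changeModel(oldModel, newModel):
    if oldModel:
        # the real helper shows a gr.Warning toast here and sleeps briefly
        print('Credentials not found for ' + oldModel + '. Using default model ' + newModel)
    return newModel

# e.g. OpenAI credentials on hand, but a watsonx model still selected
modelNameDD = 'meta-llama/llama-2-70b-chat (watsonx)'
if not 'openai' in modelNameDD:
    modelNameDD = changeModel(modelNameDD, OaiDefaultModel)
print(modelNameDD)  # gpt-3.5-turbo (openai)
```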
ttyd_functions.py CHANGED
@@ -1,5 +1,7 @@
 
 import datetime
+import gradio as gr
+import time
 import uuid
 import openai
 from langchain.embeddings import OpenAIEmbeddings
@@ -19,6 +21,7 @@ from urllib.parse import urlparse
 import mimetypes
 from pathlib import Path
 import tiktoken
+import gdown
 
 from langchain.chat_models import ChatOpenAI
 from langchain import OpenAI
@@ -67,6 +70,8 @@ def getPersonalBotApiKey():
         return getOaiCreds(os.getenv("OPENAI_API_KEY"))
     elif os.getenv("WX_API_KEY") and os.getenv("WX_PROJECT_ID"):
         return getWxCreds(os.getenv("WX_API_KEY"), os.getenv("WX_PROJECT_ID"))
+    elif os.getenv("BAM_API_KEY"):
+        return getBamCreds(os.getenv("BAM_API_KEY"))
     else:
         return {}
 
@@ -240,24 +245,30 @@ def ingestFiles(documents, files_list, prog=None):
             pass
 
         if doc is not None and doc[0].page_content:
-            if prog is not None: prog(
+            if prog is not None: prog(0.9, desc='Loaded file: '+fPath.rsplit('/')[0])
             print('Loaded file:', fPath)
             documents.extend(doc)
     return documents
 
 
-def data_ingestion(inputDir=None, file_list=[], url_list=[], prog=None):
+def data_ingestion(inputDir=None, file_list=[], url_list=[], gDriveFolder='', prog=None):
     documents = []
+    # Ingestion from Google Drive Folder
+    if gDriveFolder:
+        opFolder = './gDriveDocs/'
+        gdown.download_folder(url=gDriveFolder, output=opFolder, quiet=True)
+        files = [str(x) for x in Path(opFolder).glob('**/*')]
+        documents = ingestFiles(documents, files, prog)
     # Ingestion from Input Directory
    if inputDir is not None:
         files = [str(x) for x in Path(inputDir).glob('**/*')]
-        documents = ingestFiles(documents, files)
+        documents = ingestFiles(documents, files, prog)
     if file_list:
         documents = ingestFiles(documents, file_list, prog)
     # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
     if url_list:
         for url in url_list:
-            documents = ingestURL(documents, url, prog=prog)
+            documents = ingestURL(documents, url, prog=prog)
 
     # Cleanup documents
     for x in documents:
@@ -331,8 +342,8 @@ def getVsDict(embeddingFunc, docs, vsDict={}):
     return vsDict
 
 # used for Hardcoded documents only - not uploaded by user (userData_vecStore is separate function)
-def localData_vecStore(embKey={}, inputDir=None, file_list=[], url_list=[], vsDict={}):
-    documents = data_ingestion(inputDir, file_list, url_list)
+def localData_vecStore(embKey={}, inputDir=None, file_list=[], url_list=[], vsDict={}, gGrUrl=''):
+    documents = data_ingestion(inputDir, file_list, url_list, gGrUrl)
     if not documents:
         raise Exception('Error: No Documents Found')
     docs = split_docs(documents)
@@ -353,3 +364,9 @@ def num_tokens_from_string(string, encoding_name = "cl100k_base"):
     num_tokens = len(encoding.encode(string))
     return num_tokens
 
+def changeModel(oldModel, newModel):
+    if oldModel:
+        warning = 'Credentials not found for '+oldModel+'. Using default model '+newModel
+        gr.Warning(warning)
+        time.sleep(1)
+    return newModel