arslan-ahmed committed
Commit eb42e7a
1 Parent(s): f14681d

Optimization

Files changed (5)
  1. .gitignore +2 -1
  2. README.md +2 -2
  3. app.py +15 -17
  4. ttyd_consts.py +20 -3
  5. ttyd_functions.py +23 -6
.gitignore CHANGED
@@ -1,4 +1,5 @@
 .env
 __pycache__
 documents
-vecstore
+vecstore
+gDriveDocs
README.md CHANGED
@@ -34,8 +34,8 @@ docker run --rm -d -p 7860:7860 --env-file ./.env arslan2k12/ttyd_arslanbot
 
 Contents of `.env` file:
 ```
-TTYD_MODE=personalBot_john
-# replace john with your name - use only lowercase letters, no special characters
+TTYD_MODE=personalBot_John
+# replace John with your name - use only letters, no special characters
 
 GDRIVE_FOLDER_URL=https://drive.google.com/drive/folders/1ce1n1kleS1FOotdcu5joXeSRu_xnHjDt
 # replace with your Google Drive folder URL that has all your knowledge base files (.pdf, .docx, .txt) - make sure this folder is publicly accessible (everyone with the link)
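For reference, a complete `.env` for this mode might look like the sketch below. The first two variables are documented above; the credential line is an assumption based on `getPersonalBotApiKey()` in `ttyd_functions.py`, which reads `OPENAI_API_KEY`, `WX_API_KEY` + `WX_PROJECT_ID`, or `BAM_API_KEY`. All values are placeholders.

```
# example .env for personalBot mode (placeholder values)
TTYD_MODE=personalBot_John
GDRIVE_FOLDER_URL=https://drive.google.com/drive/folders/1ce1n1kleS1FOotdcu5joXeSRu_xnHjDt

# one set of credentials: OPENAI_API_KEY, or WX_API_KEY + WX_PROJECT_ID, or BAM_API_KEY
OPENAI_API_KEY=sk-...
```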
app.py CHANGED
@@ -1,4 +1,3 @@
-import gdown
 from dotenv import load_dotenv
 import datetime
 import openai
@@ -38,16 +37,16 @@ from ttyd_consts import *
 ###############################################################################################
 
 load_dotenv()
+TTYD_MODE = os.getenv("TTYD_MODE",'')
+
 
 # select the mode when starting container - mode options are in ttyd_consts.py
-if (os.getenv("TTYD_MODE",'')).split('_')[0]=='personalBot':
+if TTYD_MODE.split('_')[0]=='personalBot':
     mode = mode_arslan
-    gDriveUrl = (os.getenv("GDRIVE_FOLDER_URL",'')).replace('?usp=sharing','')
-    # output folder of google drive folder will be taken as input dir of personalBot
-    gdown.download_folder(url=gDriveUrl, output=mode.inputDir, quiet=True)
-    if os.getenv("TTYD_MODE",'')!='personalBot_arslan':
-        mode.title=''
-        mode.welcomeMsg=''
+    if TTYD_MODE!='personalBot_arslan':
+        user = TTYD_MODE.split('_')[1]
+        mode.title='## Talk to '+user
+        mode.welcomeMsg= welcomeMsgUser(user)
 
 elif os.getenv("TTYD_MODE",'')=='nustian':
     mode = mode_nustian
@@ -57,7 +56,7 @@ else:
 
 if mode.type!='userInputDocs':
     # local vector store as opposed to gradio state vector store, if the user is not uploading the docs
-    vsDict_hard = localData_vecStore(getPersonalBotApiKey(), inputDir=mode.inputDir, file_list=mode.file_list, url_list=mode.url_list)
+    vsDict_hard = localData_vecStore(getPersonalBotApiKey(), inputDir=mode.inputDir, file_list=mode.file_list, url_list=mode.url_list, gGrUrl=mode.gDriveFolder)
 
 ###############################################################################################
@@ -133,9 +132,9 @@ def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.P
     return vsDict_st, src_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')
 
 # initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
-def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
+def initializeChatbot(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
     progress(0.1, waitText_initialize)
-    chainTuple = updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st)
+    chainTuple = updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st)
     qa_chain_st = chainTuple[0]
     progress(0.5, waitText_initialize)
     # generate welcome message
@@ -147,7 +146,7 @@ def initializeChatbot(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st, progres
     print('Chatbot initialized at ', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
 
     return qa_chain_st, chainTuple[1], btn.update(interactive=True), initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)\
-        , status_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('Hi', welMsg)])
+        , status_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('', welMsg)])
 
 # just update the QA Chain, no updates to any UI
 def updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st):
@@ -156,15 +155,15 @@ def updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st):
 
     if api_key_st['service']=='openai':
         if not 'openai' in modelNameDD:
-            modelNameDD = 'gpt-3.5-turbo (openai)' # default model for openai
+            modelNameDD = changeModel(modelNameDD, OaiDefaultModel)
         llm = getOaiLlm(temp, modelNameDD, api_key_st)
     elif api_key_st['service']=='watsonx':
         if not 'watsonx' in modelNameDD:
-            modelNameDD = 'meta-llama/llama-2-70b-chat (watsonx)' # default model for watsonx
+            modelNameDD = changeModel(modelNameDD, WxDefaultModel)
         llm = getWxLlm(temp, modelNameDD, api_key_st)
     elif api_key_st['service']=='bam':
         if not 'bam' in modelNameDD:
-            modelNameDD = 'meta-llama/llama-2-70b-chat (bam)' # default model for bam
+            modelNameDD = changeModel(modelNameDD, BamDefaultModel)
         llm = getBamLlm(temp, modelNameDD, api_key_st)
     else:
         raise Exception('Error: Invalid or None Credentials')
@@ -261,8 +260,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
         temp_sld = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature", info='Sampling temperature to use when calling LLM. Defaults to 0.7')
         k_sld = gr.Slider(minimum=1, maximum=10, step=1, value=mode.k, label="K", info='Number of relevant documents to return from Vector Store. Defaults to 4')
         model_dd = gr.Dropdown(label='Model Name'\
-                , choices=model_dd_choices\
-                , value=model_dd_choices[0], allow_custom_value=True\
+                , choices=model_dd_choices, allow_custom_value=True\
                 , info=model_dd_info)
         stdlQs_rb = gr.Radio(label='Standalone Question', info=stdlQs_rb_info\
                 , type='index', value=stdlQs_rb_choices[1]\
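To see what the reworked mode selection does, here is a self-contained sketch of the new branch. `personalBot_jane` is a made-up mode value, and `welcomeMsgUser` below only mirrors the shape of the helper added in `ttyd_consts.py`:

```python
# Standalone sketch of the new TTYD_MODE parsing (values are illustrative).

def welcomeMsgUser(user):
    # same shape as the helper added in ttyd_consts.py
    return f"Hi, welcome to the personal chatbot of {user}."

TTYD_MODE = 'personalBot_jane'          # would normally come from the .env file

if TTYD_MODE.split('_')[0] == 'personalBot':
    if TTYD_MODE != 'personalBot_arslan':
        user = TTYD_MODE.split('_')[1]  # 'jane'
        title = '## Talk to ' + user    # markdown title shown at the top of the UI
        welcomeMsg = welcomeMsgUser(user)
        print(title)                    # -> ## Talk to jane
```

Note that the Google Drive download no longer happens at app start-up; it moved into `data_ingestion()` in `ttyd_functions.py`.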
ttyd_consts.py CHANGED
@@ -1,4 +1,7 @@
 from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
+import os
+from dotenv import load_dotenv
+load_dotenv()
 
 exp_query = 'Generate top 5 questions that I can ask about this data. Questions should be very precise and short, ideally less than 10 words.'
 
@@ -54,10 +57,16 @@ bam_models = sorted(['bigscience/bloom',
     'bigcode/starcoder',
     'google/ul2'])
 
-model_dd_info = 'You can also input any OpenAI model name or BAM model ID.'
+model_dd_info = 'Make sure your credentials are submitted before changing the model. You can also input any OpenAI model name or Watsonx/BAM model ID.'
 
 model_dd_choices = ['gpt-3.5-turbo (openai)', 'gpt-3.5-turbo-16k (openai)', 'gpt-4 (openai)', 'text-davinci-003 (Legacy - openai)', 'text-curie-001 (Legacy - openai)', 'babbage-002 (openai)'] + [model.value+' (watsonx)' for model in ModelTypes] + [model + ' (bam)' for model in bam_models]
 
+
+OaiDefaultModel = 'gpt-3.5-turbo (openai)'
+WxDefaultModel = 'meta-llama/llama-2-70b-chat (watsonx)'
+BamDefaultModel = 'meta-llama/llama-2-70b-chat (bam)'
+
+
 url_tb_info = 'Up to 100 domain webpages will be crawled for each URL. You can also enter online PDF files.'
 
 url_tb_ph = 'https://example.com, https://another.com, https://anyremotedocument.pdf'
@@ -102,14 +111,22 @@ welcomeMsgArslan = """Summary: The document provides a comprehensive overview of
 
 welcomeMsgDefault = """Hello and welcome! I'm your personal data assistant. Ask me anything about your data and I'll try my best to answer."""
 
+
+def welcomeMsgUser(user):
+    return f"""Hi, welcome to the personal chatbot of {user}. I am trained on the documents {user} has provided me. Ask me anything about {user} and I'll try my best to answer."""
+
+
+gDrFolder = (os.getenv("GDRIVE_FOLDER_URL",'')).replace('?usp=sharing','')
+
 class TtydMode():
-    def __init__(self, name='', title='', type='', dir=None, files=[], urls=[], vis=False, welMsg='', def_k=4):
+    def __init__(self, name='', title='', type='', dir=None, files=[], urls=[], vis=False, welMsg='', def_k=4, gDrFolder=''):
         self.name = name
         self.title = title # markdown title for the top display
         self.type = type # userInputDocs, fixedDocs, personalBot
         self.inputDir=dir
         self.file_list=files
         self.url_list=urls
+        self.gDriveFolder=gDrFolder
         self.uiAddDataVis = vis # load data from user - this will be true for type = userInputDocs
         self.welcomeMsg = welMsg # welcome msg constant - if not provided, LLM will generate it
         self.k = def_k # default k docs to retrieve
@@ -118,4 +135,4 @@ class TtydMode():
 
 mode_general = TtydMode(name='general', title=md_title_general, type='userInputDocs', vis=True)
 mode_nustian = TtydMode(name='nustian', title=md_title_nustian, type='fixedDocs', urls=['https://nustian.ca'])
-mode_arslan = TtydMode(name='arslan', title=md_title_arslan, type='personalBot', dir='./documents/', welMsg=welcomeMsgArslan, def_k=8)
+mode_arslan = TtydMode(name='arslan', title=md_title_arslan, type='personalBot', welMsg=welcomeMsgArslan, def_k=8, gDrFolder=gDrFolder)
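A small sketch of how the Drive folder now flows from the environment into the mode object. The class below is a trimmed stand-in for `TtydMode`, and the folder URL is a placeholder:

```python
import os

class TtydMode:  # trimmed stand-in for the class defined above
    def __init__(self, name='', def_k=4, gDrFolder=''):
        self.name = name
        self.k = def_k
        self.gDriveFolder = gDrFolder

# '?usp=sharing' is stripped exactly as in ttyd_consts.py
url = os.getenv("GDRIVE_FOLDER_URL",
                "https://drive.google.com/drive/folders/<folder-id>?usp=sharing")
gDrFolder = url.replace('?usp=sharing', '')

mode = TtydMode(name='arslan', def_k=8, gDrFolder=gDrFolder)
print(mode.gDriveFolder)
# app.py later hands mode.gDriveFolder to localData_vecStore(..., gGrUrl=...)
```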
ttyd_functions.py CHANGED
@@ -1,5 +1,7 @@
 
 import datetime
+import gradio as gr
+import time
 import uuid
 import openai
 from langchain.embeddings import OpenAIEmbeddings
@@ -19,6 +21,7 @@ from urllib.parse import urlparse
 import mimetypes
 from pathlib import Path
 import tiktoken
+import gdown
 
 from langchain.chat_models import ChatOpenAI
 from langchain import OpenAI
@@ -67,6 +70,8 @@ def getPersonalBotApiKey():
         return getOaiCreds(os.getenv("OPENAI_API_KEY"))
     elif os.getenv("WX_API_KEY") and os.getenv("WX_PROJECT_ID"):
         return getWxCreds(os.getenv("WX_API_KEY"), os.getenv("WX_PROJECT_ID"))
+    elif os.getenv("BAM_API_KEY"):
+        return getBamCreds(os.getenv("BAM_API_KEY"))
     else:
         return {}
 
@@ -240,24 +245,30 @@ def ingestFiles(documents, files_list, prog=None):
             pass
 
         if doc is not None and doc[0].page_content:
-            if prog is not None: prog(1, desc='Loaded file: '+fPath.rsplit('/')[0])
+            if prog is not None: prog(0.9, desc='Loaded file: '+fPath.rsplit('/')[0])
             print('Loaded file:', fPath)
             documents.extend(doc)
     return documents
 
 
-def data_ingestion(inputDir=None, file_list=[], url_list=[], prog=None):
+def data_ingestion(inputDir=None, file_list=[], url_list=[], gDriveFolder='', prog=None):
     documents = []
+    # Ingestion from Google Drive Folder
+    if gDriveFolder:
+        opFolder = './gDriveDocs/'
+        gdown.download_folder(url=gDriveFolder, output=opFolder, quiet=True)
+        files = [str(x) for x in Path(opFolder).glob('**/*')]
+        documents = ingestFiles(documents, files, prog)
     # Ingestion from Input Directory
     if inputDir is not None:
        files = [str(x) for x in Path(inputDir).glob('**/*')]
-        documents = ingestFiles(documents, files)
+        documents = ingestFiles(documents, files, prog)
     if file_list:
         documents = ingestFiles(documents, file_list, prog)
     # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
     if url_list:
         for url in url_list:
-            documents = ingestURL(documents, url, prog=prog)
+            documents = ingestURL(documents, url, prog=prog)
 
     # Cleanup documents
     for x in documents:
@@ -331,8 +342,8 @@ def getVsDict(embeddingFunc, docs, vsDict={}):
     return vsDict
 
 # used for hardcoded documents only - not uploaded by user (userData_vecStore is a separate function)
-def localData_vecStore(embKey={}, inputDir=None, file_list=[], url_list=[], vsDict={}):
-    documents = data_ingestion(inputDir, file_list, url_list)
+def localData_vecStore(embKey={}, inputDir=None, file_list=[], url_list=[], vsDict={}, gGrUrl=''):
+    documents = data_ingestion(inputDir, file_list, url_list, gGrUrl)
     if not documents:
         raise Exception('Error: No Documents Found')
     docs = split_docs(documents)
@@ -353,3 +364,9 @@ def num_tokens_from_string(string, encoding_name = "cl100k_base"):
     num_tokens = len(encoding.encode(string))
     return num_tokens
 
+def changeModel(oldModel, newModel):
+    if oldModel:
+        warning = 'Credentials not found for '+oldModel+'. Using default model '+newModel
+        gr.Warning(warning)
+        time.sleep(1)
+    return newModel
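The new `changeModel()` helper centralizes the fallback that `updateQaChain()` previously hard-coded per service: when the selected dropdown entry doesn't match the credential service, it warns the user via a `gr.Warning` toast and swaps in that service's default. A minimal sketch of the fallback logic, with the Gradio toast replaced by a print so it runs standalone; the model names mirror the constants added in `ttyd_consts.py`:

```python
# Sketch of the model-fallback behavior (standalone; no Gradio UI needed).
OaiDefaultModel = 'gpt-3.5-turbo (openai)'  # constant from ttyd_consts.py

def changeModel(oldModel, newModel):
    if oldModel:
        # in the app this message is surfaced as a gr.Warning toast
        print('Credentials not found for ' + oldModel + '. Using default model ' + newModel)
    return newModel

# user picked a BAM model but submitted OpenAI credentials
modelNameDD = 'meta-llama/llama-2-70b-chat (bam)'
if 'openai' not in modelNameDD:
    modelNameDD = changeModel(modelNameDD, OaiDefaultModel)

print(modelNameDD)  # -> gpt-3.5-turbo (openai)
```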