dh-mc committed on
Commit
2826548
1 Parent(s): e182c41

fixed bug in gradio app

Browse files
app.py CHANGED
@@ -7,9 +7,9 @@ from timeit import default_timer as timer
7
  import gradio as gr
8
  from anyio.from_thread import start_blocking_portal
9
  from app_modules.init import app_init
10
- from app_modules.utils import print_llm_response
11
 
12
- qa_chain = app_init()
13
 
14
  chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
15
  show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
@@ -17,9 +17,15 @@ share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
17
 
18
  using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
19
  model = (
20
- "OpenAI GPT-4" if using_openai else os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
 
 
 
 
 
 
 
21
  )
22
- href = "https://openai.com/gpt-4" if using_openai else f"https://huggingface.co/{model}"
23
 
24
  title = """<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with AI Books </h1>"""
25
 
@@ -75,7 +81,7 @@ def qa(chatbot):
75
  print("nothing generated yet - retry in 0.5s")
76
  time.sleep(0.5)
77
 
78
- for next_token in qa_chain.streamer:
79
  if next_token is job_done:
80
  break
81
  content += next_token or ""
 
7
  import gradio as gr
8
  from anyio.from_thread import start_blocking_portal
9
  from app_modules.init import app_init
10
+ from app_modules.utils import print_llm_response, remove_extra_spaces
11
 
12
+ llm_loader, qa_chain = app_init()
13
 
14
  chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
15
  show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
 
17
 
18
  using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
19
  model = (
20
+ "OpenAI GPT-3.5"
21
+ if using_openai
22
+ else os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
23
+ )
24
+ href = (
25
+ "https://platform.openai.com/docs/models/gpt-3-5"
26
+ if using_openai
27
+ else f"https://huggingface.co/{model}"
28
  )
 
29
 
30
  title = """<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with AI Books </h1>"""
31
 
 
81
  print("nothing generated yet - retry in 0.5s")
82
  time.sleep(0.5)
83
 
84
+ for next_token in llm_loader.streamer:
85
  if next_token is job_done:
86
  break
87
  content += next_token or ""
app_modules/init.py CHANGED
@@ -75,4 +75,4 @@ def app_init():
75
  end = timer()
76
  print(f"Completed in {end - start:.3f}s")
77
 
78
- return qa_chain
 
75
  end = timer()
76
  print(f"Completed in {end - start:.3f}s")
77
 
78
+ return llm_loader, qa_chain
app_modules/llm_chat_chain.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import ConversationalRetrievalChain
2
+ from langchain.chains.base import Chain
3
+ from langchain.memory import ConversationBufferMemory
4
+ from langchain import LLMChain, PromptTemplate
5
+ from app_modules.llm_inference import LLMInference
6
+
7
+
8
+ class ChatChain(LLMInference):
9
+ def __init__(self, llm_loader):
10
+ super().__init__(llm_loader)
11
+
12
+ def create_chain(self) -> Chain:
13
+ template = """You are a chatbot having a conversation with a human.
14
+ {chat_history}
15
+ Human: {question}
16
+ Chatbot:"""
17
+
18
+ prompt = PromptTemplate(
19
+ input_variables=["chat_history", "question"], template=template
20
+ )
21
+ memory = ConversationBufferMemory(memory_key="chat_history")
22
+
23
+ llm_chain = LLMChain(
24
+ llm=self.llm_loader.llm,
25
+ prompt=prompt,
26
+ verbose=True,
27
+ memory=memory,
28
+ )
29
+
30
+ return llm_chain
app_modules/llm_inference.py CHANGED
@@ -55,15 +55,16 @@ class LLMInference(metaclass=abc.ABCMeta):
55
  else chain(inputs)
56
  )
57
 
58
- result["answer"] = remove_extra_spaces(result["answer"])
59
-
60
- base_url = os.environ.get("PDF_FILE_BASE_URL")
61
- if base_url is not None and len(base_url) > 0:
62
- documents = result["source_documents"]
63
- for doc in documents:
64
- source = doc.metadata["source"]
65
- title = source.split("/")[-1]
66
- doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
 
67
 
68
  return result
69
 
 
55
  else chain(inputs)
56
  )
57
 
58
+ if "answer" in result:
59
+ result["answer"] = remove_extra_spaces(result["answer"])
60
+
61
+ base_url = os.environ.get("PDF_FILE_BASE_URL")
62
+ if base_url is not None and len(base_url) > 0:
63
+ documents = result["source_documents"]
64
+ for doc in documents:
65
+ source = doc.metadata["source"]
66
+ title = source.split("/")[-1]
67
+ doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
68
 
69
  return result
70
 
app_modules/llm_loader.py CHANGED
@@ -90,11 +90,11 @@ class LLMLoader:
90
  streamer: any
91
  max_tokens_limit: int
92
 
93
- def __init__(self, llm_model_type):
94
  self.llm_model_type = llm_model_type
95
  self.llm = None
96
  self.streamer = TextIteratorStreamer("")
97
- self.max_tokens_limit = 2048
98
  self.search_kwargs = {"k": 4}
99
 
100
  def _init_streamer(self, tokenizer, custom_handler):
 
90
  streamer: any
91
  max_tokens_limit: int
92
 
93
+ def __init__(self, llm_model_type, max_tokens_limit: int = 2048):
94
  self.llm_model_type = llm_model_type
95
  self.llm = None
96
  self.streamer = TextIteratorStreamer("")
97
+ self.max_tokens_limit = max_tokens_limit
98
  self.search_kwargs = {"k": 4}
99
 
100
  def _init_streamer(self, tokenizer, custom_handler):
app_modules/llm_qa_chain.py CHANGED
@@ -8,7 +8,7 @@ from app_modules.llm_inference import LLMInference
8
  class QAChain(LLMInference):
9
  vectorstore: VectorStore
10
 
11
- def __init__(self, vectorstore, llm_loader: int = 2048):
12
  super().__init__(llm_loader)
13
  self.vectorstore = vectorstore
14
 
 
8
  class QAChain(LLMInference):
9
  vectorstore: VectorStore
10
 
11
+ def __init__(self, vectorstore, llm_loader):
12
  super().__init__(llm_loader)
13
  self.vectorstore = vectorstore
14
 
app_modules/qa_chain.py DELETED
@@ -1,631 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
- import urllib
5
- from queue import Queue
6
- from threading import Thread
7
- from typing import Any, Optional
8
-
9
- import torch
10
- from langchain.callbacks.base import BaseCallbackHandler
11
- from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
12
- from langchain.callbacks.tracers import LangChainTracer
13
- from langchain.chains import ConversationalRetrievalChain
14
- from langchain.chat_models import ChatOpenAI
15
- from langchain.llms import GPT4All, HuggingFacePipeline, LlamaCpp
16
- from langchain.schema import LLMResult
17
- from langchain.vectorstores import VectorStore
18
- from langchain.vectorstores.base import VectorStore
19
- from transformers import (
20
- AutoConfig,
21
- AutoModelForCausalLM,
22
- AutoModelForSeq2SeqLM,
23
- AutoTokenizer,
24
- BitsAndBytesConfig,
25
- StoppingCriteria,
26
- StoppingCriteriaList,
27
- T5Tokenizer,
28
- TextStreamer,
29
- pipeline,
30
- )
31
-
32
- from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
33
- from app_modules.utils import ensure_model_is_downloaded, remove_extra_spaces
34
-
35
-
36
- class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
37
- def __init__(
38
- self,
39
- tokenizer: "AutoTokenizer",
40
- skip_prompt: bool = False,
41
- timeout: Optional[float] = None,
42
- **decode_kwargs,
43
- ):
44
- super().__init__(tokenizer, skip_prompt, **decode_kwargs)
45
- self.text_queue = Queue()
46
- self.stop_signal = None
47
- self.timeout = timeout
48
-
49
- def on_finalized_text(self, text: str, stream_end: bool = False):
50
- super().on_finalized_text(text, stream_end=stream_end)
51
-
52
- """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
53
- self.text_queue.put(text, timeout=self.timeout)
54
- if stream_end:
55
- print("\n")
56
- self.text_queue.put("\n", timeout=self.timeout)
57
- self.text_queue.put(self.stop_signal, timeout=self.timeout)
58
-
59
- def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
60
- sys.stdout.write(token)
61
- sys.stdout.flush()
62
- self.text_queue.put(token, timeout=self.timeout)
63
-
64
- def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
65
- print("\n")
66
- self.text_queue.put("\n", timeout=self.timeout)
67
- self.text_queue.put(self.stop_signal, timeout=self.timeout)
68
-
69
- def __iter__(self):
70
- return self
71
-
72
- def __next__(self):
73
- value = self.text_queue.get(timeout=self.timeout)
74
- if value == self.stop_signal:
75
- raise StopIteration()
76
- else:
77
- return value
78
-
79
- def reset(self, q: Queue = None):
80
- # print("resetting TextIteratorStreamer")
81
- self.text_queue = q if q is not None else Queue()
82
-
83
- def empty(self):
84
- return self.text_queue.empty()
85
-
86
-
87
- class QAChain:
88
- llm_model_type: str
89
- vectorstore: VectorStore
90
- llm: any
91
- streamer: any
92
-
93
- def __init__(self, vectorstore, llm_model_type):
94
- self.vectorstore = vectorstore
95
- self.llm_model_type = llm_model_type
96
- self.llm = None
97
- self.streamer = TextIteratorStreamer("")
98
- self.max_tokens_limit = 2048
99
- self.search_kwargs = {"k": 4}
100
-
101
- def _init_streamer(self, tokenizer, custom_handler):
102
- self.streamer = (
103
- TextIteratorStreamer(
104
- tokenizer,
105
- timeout=10.0,
106
- skip_prompt=True,
107
- skip_special_tokens=True,
108
- )
109
- if custom_handler is None
110
- else TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
111
- )
112
-
113
- def init(
114
- self,
115
- custom_handler: Optional[BaseCallbackHandler] = None,
116
- n_threds: int = 4,
117
- hf_pipeline_device_type: str = None,
118
- ):
119
- print("initializing LLM: " + self.llm_model_type)
120
-
121
- if hf_pipeline_device_type is None:
122
- hf_pipeline_device_type = "cpu"
123
-
124
- using_cuda = hf_pipeline_device_type.startswith("cuda")
125
- torch_dtype = torch.float16 if using_cuda else torch.float32
126
- if os.environ.get("USING_TORCH_BFLOAT16") == "true":
127
- torch_dtype = torch.bfloat16
128
- load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")
129
-
130
- print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
131
- print(f" load_quantized_model: {load_quantized_model}")
132
- print(f" torch_dtype: {torch_dtype}")
133
- print(f" n_threds: {n_threds}")
134
-
135
- double_quant_config = BitsAndBytesConfig(
136
- load_in_4bit=load_quantized_model == "4bit",
137
- bnb_4bit_use_double_quant=load_quantized_model == "4bit",
138
- load_in_8bit=load_quantized_model == "8bit",
139
- bnb_8bit_use_double_quant=load_quantized_model == "8bit",
140
- )
141
-
142
- callbacks = [self.streamer]
143
- if custom_handler is not None:
144
- callbacks.append(custom_handler)
145
-
146
- if self.llm is None:
147
- if self.llm_model_type == "openai":
148
- MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"
149
- print(f" using model: {MODEL_NAME}")
150
- self.llm = ChatOpenAI(
151
- model_name=MODEL_NAME,
152
- streaming=True,
153
- callbacks=callbacks,
154
- verbose=True,
155
- temperature=0,
156
- )
157
- elif self.llm_model_type.startswith("gpt4all"):
158
- MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
159
- self.llm = GPT4All(
160
- model=MODEL_PATH,
161
- max_tokens=2048,
162
- n_threads=n_threds,
163
- backend="gptj" if self.llm_model_type == "gpt4all-j" else "llama",
164
- callbacks=callbacks,
165
- verbose=True,
166
- use_mlock=True,
167
- )
168
- elif self.llm_model_type == "llamacpp":
169
- MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
170
- self.llm = LlamaCpp(
171
- model_path=MODEL_PATH,
172
- n_ctx=8192,
173
- n_threads=n_threds,
174
- seed=0,
175
- temperature=0,
176
- max_tokens=2048,
177
- callbacks=callbacks,
178
- verbose=True,
179
- use_mlock=True,
180
- )
181
- elif self.llm_model_type.startswith("huggingface"):
182
- MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
183
- print(f" loading model: {MODEL_NAME_OR_PATH}")
184
-
185
- hf_auth_token = os.environ.get("HUGGINGFACE_AUTH_TOKEN")
186
- transformers_offline = os.environ.get("TRANSFORMERS_OFFLINE") == "1"
187
- token = (
188
- hf_auth_token
189
- if hf_auth_token is not None
190
- and len(hf_auth_token) > 0
191
- and not transformers_offline
192
- else None
193
- )
194
- print(f" HF auth token: {str(token)[-5:]}")
195
-
196
- is_t5 = "t5" in MODEL_NAME_OR_PATH
197
- temperature = (
198
- 0.01
199
- if "gpt4all-j" in MODEL_NAME_OR_PATH
200
- or "dolly" in MODEL_NAME_OR_PATH
201
- else 0
202
- )
203
- use_fast = (
204
- "stable" in MODEL_NAME_OR_PATH
205
- or "RedPajama" in MODEL_NAME_OR_PATH
206
- or "dolly" in MODEL_NAME_OR_PATH
207
- )
208
- padding_side = "left" # if "dolly" in MODEL_NAME_OR_PATH else None
209
-
210
- config = AutoConfig.from_pretrained(
211
- MODEL_NAME_OR_PATH,
212
- trust_remote_code=True,
213
- token=token,
214
- )
215
- # config.attn_config["attn_impl"] = "triton"
216
- # config.max_seq_len = 4096
217
- config.init_device = hf_pipeline_device_type
218
-
219
- tokenizer = (
220
- T5Tokenizer.from_pretrained(
221
- MODEL_NAME_OR_PATH,
222
- token=token,
223
- )
224
- if is_t5
225
- else AutoTokenizer.from_pretrained(
226
- MODEL_NAME_OR_PATH,
227
- use_fast=use_fast,
228
- trust_remote_code=True,
229
- padding_side=padding_side,
230
- token=token,
231
- )
232
- )
233
-
234
- self._init_streamer(tokenizer, custom_handler)
235
-
236
- task = "text2text-generation" if is_t5 else "text-generation"
237
-
238
- return_full_text = True if "dolly" in MODEL_NAME_OR_PATH else None
239
-
240
- repetition_penalty = (
241
- 1.15
242
- if "falcon" in MODEL_NAME_OR_PATH
243
- else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
244
- )
245
-
246
- if load_quantized_model is not None:
247
- model = (
248
- AutoModelForSeq2SeqLM.from_pretrained(
249
- MODEL_NAME_OR_PATH,
250
- config=config,
251
- quantization_config=double_quant_config,
252
- trust_remote_code=True,
253
- token=token,
254
- )
255
- if is_t5
256
- else AutoModelForCausalLM.from_pretrained(
257
- MODEL_NAME_OR_PATH,
258
- config=config,
259
- quantization_config=double_quant_config,
260
- trust_remote_code=True,
261
- token=token,
262
- )
263
- )
264
-
265
- print(f"Model memory footprint: {model.get_memory_footprint()}")
266
-
267
- eos_token_id = -1
268
- # starchat-beta uses a special <|end|> token with ID 49155 to denote ends of a turn
269
- if "starchat" in MODEL_NAME_OR_PATH:
270
- eos_token_id = 49155
271
- pad_token_id = eos_token_id
272
-
273
- pipe = (
274
- InstructionTextGenerationPipeline(
275
- task=task,
276
- model=model,
277
- tokenizer=tokenizer,
278
- streamer=self.streamer,
279
- max_new_tokens=2048,
280
- temperature=temperature,
281
- return_full_text=return_full_text, # langchain expects the full text
282
- repetition_penalty=repetition_penalty,
283
- )
284
- if "dolly" in MODEL_NAME_OR_PATH
285
- else (
286
- pipeline(
287
- task,
288
- model=model,
289
- tokenizer=tokenizer,
290
- eos_token_id=eos_token_id,
291
- pad_token_id=pad_token_id,
292
- streamer=self.streamer,
293
- return_full_text=return_full_text, # langchain expects the full text
294
- device_map="auto",
295
- trust_remote_code=True,
296
- max_new_tokens=2048,
297
- do_sample=True,
298
- temperature=0.01,
299
- top_p=0.95,
300
- top_k=50,
301
- repetition_penalty=repetition_penalty,
302
- )
303
- if eos_token_id != -1
304
- else pipeline(
305
- task,
306
- model=model,
307
- tokenizer=tokenizer,
308
- streamer=self.streamer,
309
- return_full_text=return_full_text, # langchain expects the full text
310
- device_map="auto",
311
- trust_remote_code=True,
312
- max_new_tokens=2048,
313
- # verbose=True,
314
- temperature=temperature,
315
- top_p=0.95,
316
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
317
- repetition_penalty=repetition_penalty,
318
- )
319
- )
320
- )
321
- elif "dolly" in MODEL_NAME_OR_PATH:
322
- model = AutoModelForCausalLM.from_pretrained(
323
- MODEL_NAME_OR_PATH,
324
- device_map=hf_pipeline_device_type,
325
- torch_dtype=torch_dtype,
326
- )
327
-
328
- pipe = InstructionTextGenerationPipeline(
329
- task=task,
330
- model=model,
331
- tokenizer=tokenizer,
332
- streamer=self.streamer,
333
- max_new_tokens=2048,
334
- temperature=temperature,
335
- return_full_text=True,
336
- repetition_penalty=repetition_penalty,
337
- token=token,
338
- )
339
- else:
340
- if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
341
- use_auth_token = None
342
- model = (
343
- AutoModelForSeq2SeqLM.from_pretrained(
344
- MODEL_NAME_OR_PATH,
345
- config=config,
346
- trust_remote_code=True,
347
- token=token,
348
- )
349
- if is_t5
350
- else AutoModelForCausalLM.from_pretrained(
351
- MODEL_NAME_OR_PATH,
352
- config=config,
353
- trust_remote_code=True,
354
- token=token,
355
- )
356
- )
357
- print(f"Model memory footprint: {model.get_memory_footprint()}")
358
- else:
359
- use_auth_token = token
360
- model = MODEL_NAME_OR_PATH
361
-
362
- pipe = pipeline(
363
- task,
364
- model=model,
365
- tokenizer=tokenizer,
366
- streamer=self.streamer,
367
- return_full_text=return_full_text, # langchain expects the full text
368
- device=hf_pipeline_device_type,
369
- torch_dtype=torch_dtype,
370
- max_new_tokens=2048,
371
- trust_remote_code=True,
372
- temperature=temperature,
373
- top_p=0.95,
374
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
375
- repetition_penalty=1.115,
376
- token=use_auth_token,
377
- )
378
-
379
- self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
380
- elif self.llm_model_type == "mosaicml":
381
- MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
382
- print(f" loading model: {MODEL_NAME_OR_PATH}")
383
-
384
- config = AutoConfig.from_pretrained(
385
- MODEL_NAME_OR_PATH, trust_remote_code=True
386
- )
387
- # config.attn_config["attn_impl"] = "triton"
388
- config.max_seq_len = 16384 if "30b" in MODEL_NAME_OR_PATH else 4096
389
- config.init_device = hf_pipeline_device_type
390
-
391
- model = (
392
- AutoModelForCausalLM.from_pretrained(
393
- MODEL_NAME_OR_PATH,
394
- config=config,
395
- quantization_config=double_quant_config,
396
- trust_remote_code=True,
397
- )
398
- if load_quantized_model is not None
399
- else AutoModelForCausalLM.from_pretrained(
400
- MODEL_NAME_OR_PATH,
401
- config=config,
402
- torch_dtype=torch_dtype,
403
- trust_remote_code=True,
404
- )
405
- )
406
-
407
- print(f"Model loaded on {config.init_device}")
408
- print(f"Model memory footprint: {model.get_memory_footprint()}")
409
-
410
- tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
411
- self._init_streamer(tokenizer, custom_handler)
412
-
413
- # mtp-7b is trained to add "<|endoftext|>" at the end of generations
414
- stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])
415
-
416
- # define custom stopping criteria object
417
- class StopOnTokens(StoppingCriteria):
418
- def __call__(
419
- self,
420
- input_ids: torch.LongTensor,
421
- scores: torch.FloatTensor,
422
- **kwargs,
423
- ) -> bool:
424
- for stop_id in stop_token_ids:
425
- if input_ids[0][-1] == stop_id:
426
- return True
427
- return False
428
-
429
- stopping_criteria = StoppingCriteriaList([StopOnTokens()])
430
-
431
- max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
432
- self.max_tokens_limit = max_new_tokens
433
- self.search_kwargs = (
434
- {"k": 8} if "30b" in MODEL_NAME_OR_PATH else self.search_kwargs
435
- )
436
- repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
437
-
438
- pipe = (
439
- pipeline(
440
- model=model,
441
- tokenizer=tokenizer,
442
- streamer=self.streamer,
443
- return_full_text=True, # langchain expects the full text
444
- task="text-generation",
445
- device_map="auto",
446
- # we pass model parameters here too
447
- stopping_criteria=stopping_criteria, # without this model will ramble
448
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
449
- top_p=0.95, # select from top tokens whose probability add up to 15%
450
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
451
- max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
452
- repetition_penalty=repetition_penalty, # without this output begins repeating
453
- )
454
- if load_quantized_model is not None
455
- else pipeline(
456
- model=model,
457
- tokenizer=tokenizer,
458
- streamer=self.streamer,
459
- return_full_text=True, # langchain expects the full text
460
- task="text-generation",
461
- device=config.init_device,
462
- # we pass model parameters here too
463
- stopping_criteria=stopping_criteria, # without this model will ramble
464
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
465
- top_p=0.95, # select from top tokens whose probability add up to 15%
466
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
467
- max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
468
- repetition_penalty=repetition_penalty, # without this output begins repeating
469
- )
470
- )
471
- self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
472
- elif self.llm_model_type == "stablelm":
473
- MODEL_NAME_OR_PATH = os.environ.get("STABLELM_MODEL_NAME_OR_PATH")
474
- print(f" loading model: {MODEL_NAME_OR_PATH}")
475
-
476
- config = AutoConfig.from_pretrained(
477
- MODEL_NAME_OR_PATH, trust_remote_code=True
478
- )
479
- # config.attn_config["attn_impl"] = "triton"
480
- # config.max_seq_len = 4096
481
- config.init_device = hf_pipeline_device_type
482
-
483
- model = (
484
- AutoModelForCausalLM.from_pretrained(
485
- MODEL_NAME_OR_PATH,
486
- config=config,
487
- quantization_config=double_quant_config,
488
- trust_remote_code=True,
489
- )
490
- if load_quantized_model is not None
491
- else AutoModelForCausalLM.from_pretrained(
492
- MODEL_NAME_OR_PATH,
493
- config=config,
494
- torch_dtype=torch_dtype,
495
- trust_remote_code=True,
496
- )
497
- )
498
-
499
- print(f"Model loaded on {config.init_device}")
500
- print(f"Model memory footprint: {model.get_memory_footprint()}")
501
-
502
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
503
- self._init_streamer(tokenizer, custom_handler)
504
-
505
- class StopOnTokens(StoppingCriteria):
506
- def __call__(
507
- self,
508
- input_ids: torch.LongTensor,
509
- scores: torch.FloatTensor,
510
- **kwargs,
511
- ) -> bool:
512
- stop_ids = [50278, 50279, 50277, 1, 0]
513
- for stop_id in stop_ids:
514
- if input_ids[0][-1] == stop_id:
515
- return True
516
- return False
517
-
518
- stopping_criteria = StoppingCriteriaList([StopOnTokens()])
519
-
520
- pipe = (
521
- pipeline(
522
- model=model,
523
- tokenizer=tokenizer,
524
- streamer=self.streamer,
525
- return_full_text=True, # langchain expects the full text
526
- task="text-generation",
527
- device_map="auto",
528
- # we pass model parameters here too
529
- stopping_criteria=stopping_criteria, # without this model will ramble
530
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
531
- top_p=0.95, # select from top tokens whose probability add up to 15%
532
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
533
- max_new_tokens=2048, # mex number of tokens to generate in the output
534
- repetition_penalty=1.25, # without this output begins repeating
535
- )
536
- if load_quantized_model is not None
537
- else pipeline(
538
- model=model,
539
- tokenizer=tokenizer,
540
- streamer=self.streamer,
541
- return_full_text=True, # langchain expects the full text
542
- task="text-generation",
543
- device=config.init_device,
544
- # we pass model parameters here too
545
- stopping_criteria=stopping_criteria, # without this model will ramble
546
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
547
- top_p=0.95, # select from top tokens whose probability add up to 15%
548
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
549
- max_new_tokens=2048, # mex number of tokens to generate in the output
550
- repetition_penalty=1.05, # without this output begins repeating
551
- )
552
- )
553
- self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
554
-
555
- print("initialization complete")
556
-
557
- def get_chain(self, tracing: bool = False) -> ConversationalRetrievalChain:
558
- if tracing:
559
- tracer = LangChainTracer()
560
- tracer.load_default_session()
561
-
562
- if self.llm is None:
563
- self.init()
564
-
565
- qa = ConversationalRetrievalChain.from_llm(
566
- self.llm,
567
- self.vectorstore.as_retriever(search_kwargs=self.search_kwargs),
568
- max_tokens_limit=self.max_tokens_limit,
569
- return_source_documents=True,
570
- )
571
-
572
- return qa
573
-
574
- def call(self, inputs, streaming_handler, q: Queue = None, tracing: bool = False):
575
- print(inputs)
576
-
577
- if self.streamer is not None and isinstance(
578
- self.streamer, TextIteratorStreamer
579
- ):
580
- self.streamer.reset(q)
581
-
582
- qa = self.get_chain(tracing)
583
- result = (
584
- self._run_qa_chain(
585
- qa,
586
- inputs,
587
- streaming_handler,
588
- )
589
- if streaming_handler is not None
590
- else qa(inputs)
591
- )
592
-
593
- result["answer"] = remove_extra_spaces(result["answer"])
594
-
595
- base_url = os.environ.get("PDF_FILE_BASE_URL")
596
- if base_url is not None and len(base_url) > 0:
597
- documents = result["source_documents"]
598
- for doc in documents:
599
- source = doc.metadata["source"]
600
- title = source.split("/")[-1]
601
- doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
602
-
603
- return result
604
-
605
- def _run_qa_chain(self, qa, inputs, streaming_handler):
606
- que = Queue()
607
-
608
- t = Thread(
609
- target=lambda qa, inputs, q, sh: q.put(qa(inputs, callbacks=[sh])),
610
- args=(qa, inputs, que, streaming_handler),
611
- )
612
- t.start()
613
-
614
- if self.streamer is not None and isinstance(
615
- self.streamer, TextIteratorStreamer
616
- ):
617
- count = 2 if len(inputs.get("chat_history")) > 0 else 1
618
-
619
- while count > 0:
620
- try:
621
- for token in self.streamer:
622
- streaming_handler.on_llm_new_token(token)
623
-
624
- self.streamer.reset()
625
- count -= 1
626
- except Exception:
627
- print("nothing generated yet - retry in 0.5s")
628
- time.sleep(0.5)
629
-
630
- t.join()
631
- return que.get()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/YT_LLaMA2_7B_Chat_LangChain_Basics.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test.py CHANGED
@@ -8,11 +8,12 @@ from langchain.callbacks.base import BaseCallbackHandler
8
  from langchain.schema import HumanMessage
9
 
10
  from app_modules.init import app_init
 
11
  from app_modules.llm_loader import LLMLoader
12
  from app_modules.utils import get_device_types, print_llm_response
13
 
14
 
15
- class TestLLMLoader: # (unittest.TestCase):
16
  question = "What's the capital city of Malaysia?"
17
 
18
  def run_test_case(self, llm_model_type, query):
@@ -50,6 +51,50 @@ class TestLLMLoader: # (unittest.TestCase):
50
  self.run_test_case("huggingface", self.question)
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  class TestQAChain(unittest.TestCase):
54
  qa_chain: any
55
  question = "What's deep learning?"
@@ -57,16 +102,25 @@ class TestQAChain(unittest.TestCase):
57
  def run_test_case(self, llm_model_type, query):
58
  start = timer()
59
  os.environ["LLM_MODEL_TYPE"] = llm_model_type
60
- qa_chain = app_init()
61
  end = timer()
62
  print(f"App initialized in {end - start:.3f}s")
63
 
64
- inputs = {"question": query, "chat_history": []}
 
65
  result = qa_chain.call_chain(inputs, None)
66
  end2 = timer()
67
  print(f"Inference completed in {end2 - end:.3f}s")
68
  print_llm_response(result)
69
 
 
 
 
 
 
 
 
 
70
  def test_openai(self):
71
  self.run_test_case("openai", self.question)
72
 
 
8
  from langchain.schema import HumanMessage
9
 
10
  from app_modules.init import app_init
11
+ from app_modules.llm_chat_chain import ChatChain
12
  from app_modules.llm_loader import LLMLoader
13
  from app_modules.utils import get_device_types, print_llm_response
14
 
15
 
16
+ class TestLLMLoader(unittest.TestCase):
17
  question = "What's the capital city of Malaysia?"
18
 
19
  def run_test_case(self, llm_model_type, query):
 
51
  self.run_test_case("huggingface", self.question)
52
 
53
 
54
+ class TestChatChain(unittest.TestCase):
55
+ question = "What's the capital city of Malaysia?"
56
+
57
+ def run_test_case(self, llm_model_type, query):
58
+ n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
59
+
60
+ hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
61
+ print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
62
+ print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
63
+
64
+ llm_loader = LLMLoader(llm_model_type)
65
+ start = timer()
66
+ llm_loader.init(
67
+ n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type
68
+ )
69
+ chat = ChatChain(llm_loader)
70
+ end = timer()
71
+ print(f"Model loaded in {end - start:.3f}s")
72
+
73
+ inputs = {"question": query}
74
+ result = chat.call_chain(inputs, None)
75
+ end2 = timer()
76
+ print(f"Inference completed in {end2 - end:.3f}s")
77
+ print(result)
78
+
79
+ inputs = {"question": "how many people?"}
80
+ result = chat.call_chain(inputs, None)
81
+ end3 = timer()
82
+ print(f"Inference completed in {end3 - end2:.3f}s")
83
+ print(result)
84
+
85
+ def test_openai(self):
86
+ self.run_test_case("openai", self.question)
87
+
88
+ def test_llamacpp(self):
89
+ self.run_test_case("llamacpp", self.question)
90
+
91
+ def test_gpt4all_j(self):
92
+ self.run_test_case("gpt4all-j", self.question)
93
+
94
+ def test_huggingface(self):
95
+ self.run_test_case("huggingface", self.question)
96
+
97
+
98
  class TestQAChain(unittest.TestCase):
99
  qa_chain: any
100
  question = "What's deep learning?"
 
102
  def run_test_case(self, llm_model_type, query):
103
  start = timer()
104
  os.environ["LLM_MODEL_TYPE"] = llm_model_type
105
+ qa_chain = app_init()[1]
106
  end = timer()
107
  print(f"App initialized in {end - start:.3f}s")
108
 
109
+ chat_history = []
110
+ inputs = {"question": query, "chat_history": chat_history}
111
  result = qa_chain.call_chain(inputs, None)
112
  end2 = timer()
113
  print(f"Inference completed in {end2 - end:.3f}s")
114
  print_llm_response(result)
115
 
116
+ chat_history.append((query, result["answer"]))
117
+
118
+ inputs = {"question": "tell me more", "chat_history": chat_history}
119
+ result = qa_chain.call_chain(inputs, None)
120
+ end3 = timer()
121
+ print(f"Inference completed in {end3 - end2:.3f}s")
122
+ print(result)
123
+
124
  def test_openai(self):
125
  self.run_test_case("openai", self.question)
126