KingNish committed on
Commit
4f3004e
1 Parent(s): a6d2807

Update chatbot.py

Files changed (1)
  1. chatbot.py +17 -32
chatbot.py CHANGED
@@ -198,6 +198,8 @@ client_mixtral = InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
 client_mistral = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 generate_kwargs = dict( max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False )
 
+system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant made by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible detailed results to user according to their query. Reply the question asked by user properly and best possible way.<|im_end|>"
+
 @spaces.GPU(duration=60, queue=False)
 def model_inference( user_prompt, chat_history, web_search):
     if not user_prompt["files"]:
@@ -242,45 +244,28 @@ def model_inference( user_prompt, chat_history, web_search):
             output += response.token.text
             yield output
     else:
-        message = user_prompt
-        if len(message["files"]) == 1:
-            image = [message["files"][0].path]
-        elif len(message["files"]) > 1:
-            image = [msg.path for msg in message["files"]]
+        image = user_prompt["files"][-1]
 
-        txt = message["text"]
+        txt = user_prompt["text"]
+        img = user_prompt["files"]
+        ext_buffer =f"'user\ntext': '{txt}', 'files': '{img}' assistant"
 
         video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
         image_extensions = Image.registered_extensions()
         image_extensions = tuple([ex for ex, f in image_extensions.items()])
-
-        if len(image) == 1:
-            if image.endswith(video_extensions):
-                image = sample_frames(image)
-                print(len(image))
-                image_tokens = "<image>" * int(len(image))
-                prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
-            elif image.endswith(image_extensions):
-                image = Image.open(image).convert("RGB")
-                prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
-
-        elif len(image) > 1:
-            image_list = []
 
-            for img in image:
-                if img.endswith(image_extensions):
-                    img = Image.open(img).convert("RGB")
-                    image_list.append(img)
+        if image.endswith(video_extensions):
+            image = sample_frames(image)
+            print(len(image))
+            image_tokens = "<image>" * int(len(image))
+            prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
+
+        elif image.endswith(image_extensions):
+            image = Image.open(image).convert("RGB")
+            prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
+
+        final_prompt = f"{system_llava}\n{prompt}"
 
-            elif img.endswith(video_extensions):
-                frames = sample_frames(img)
-                for frame in frames:
-                    image_list.append(frame)
-
-            toks = "<image>" * len(image_list)
-            prompt = f"<|im_start|>user {toks}\n{user_prompt}<|im_end|><|im_start|>assistant"
-            image = image_list
-
         inputs = processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, **{"skip_special_tokens": True})
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
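
Note: final_prompt (system_llava joined to the user prompt) is assigned just before the processor(...) call, but that call still passes prompt, so the new system prompt never reaches the model; ext_buffer is likewise assigned and never read in the lines shown, and both prompt templates interpolate user_prompt (the whole dict) rather than txt. A minimal sketch of the presumably intended call (an assumption about intent, not part of this commit):

    inputs = processor(final_prompt, image, return_tensors="pt").to("cuda", torch.float16)

The commit also narrows input handling: image = user_prompt["files"][-1] keeps only the last uploaded file, where the removed code collected every uploaded image and sampled frames from every video.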
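sample_frames is called in both branches but defined elsewhere in chatbot.py and not shown in this diff. A minimal sketch of such a helper, assuming OpenCV and evenly spaced sampling; the name matches the diff, while the signature, frame count, and implementation are assumptions:

    import cv2
    from PIL import Image

    def sample_frames(video_path, num_frames=12):
        # Decode the video and keep roughly num_frames evenly spaced frames.
        cap = cv2.VideoCapture(video_path)
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(total // num_frames, 1)
        frames = []
        for i in range(total):
            ok, frame = cap.read()
            if not ok:
                break
            if i % step == 0:
                # OpenCV decodes to BGR; the processor expects RGB PIL images.
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        cap.release()
        return frames

Whatever the real implementation, the caller only relies on it returning a list whose length sets the number of <image> tokens in the prompt.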