KingNish committed on
Commit cd4b4e1
Parent: 92d81f0

Update chatbot.py

Files changed (1):
  1. chatbot.py  +5 -12
chatbot.py CHANGED
@@ -27,8 +27,8 @@ import io # Add this import for working with image bytes
 
 # You can also use models that are commented below
 # model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
-model_id = "llava-hf/llava-interleave-qwen-7b-hf"
-# model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
+# model_id = "llava-hf/llava-interleave-qwen-7b-hf"
+model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
 processor = LlavaProcessor.from_pretrained(model_id)
 model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True)
 model.to("cuda")
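For context, a minimal sketch of how the newly selected DPO checkpoint is typically driven; the image path and question are placeholders, not taken from this repo. (Recent transformers releases also prefer attn_implementation="flash_attention_2" over the use_flash_attention_2 flag kept here.)

import torch
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to("cuda")

# Placeholder inputs: one image plus a ChatML-style prompt with an <image> slot.
image = Image.open("example.jpg")
prompt = "<|im_start|>user <image>\nDescribe this image.<|im_end|><|im_start|>assistant"

inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda", torch.float16)
output = model.generate(**inputs, max_new_tokens=100)
# Decode only the newly generated tokens, skipping the echoed prompt.
print(processor.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))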
@@ -38,14 +38,7 @@ def sample_frames(video_file, num_frames) :
     try:
         video = cv2.VideoCapture(video_file)
         total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-        fps = int(video.get(cv2.CAP_PROP_FPS))
-        # extracts 5 images/sec of video
-        if (total_frames/fps) < 3:
-            num_frames = 12
-        elif (total_frames/fps) > 5:
-            num_frames = 24
-        else:
-            num_frames = ((total_frames//fps)*5)
+        num_frames = 12
         interval = total_frames // num_frames
         frames = []
         for i in range(total_frames):
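Putting the hunk in context: a sketch of how sample_frames reads after this change. The diff does not show the signature being updated, so the num_frames=12 default below is an assumption needed to keep the commit's new one-argument call site, sample_frames(image), from raising a TypeError; the loop body and cleanup are likewise reconstructed from typical OpenCV usage, not shown in this diff.

import cv2
from PIL import Image

def sample_frames(video_file, num_frames=12):  # default assumed, not in the diff
    try:
        video = cv2.VideoCapture(video_file)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        num_frames = 12  # fixed sample count replaces the old fps-based logic
        interval = max(total_frames // num_frames, 1)  # max() guard for very short clips (assumed)
        frames = []
        for i in range(total_frames):
            ret, frame = video.read()
            if not ret:
                continue
            if i % interval == 0:
                # OpenCV decodes BGR; convert before handing PIL images to the processor.
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        video.release()
        return frames
    except Exception:
        # The diff shows a try: with no except; an empty list is one plausible fallback.
        return []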
@@ -199,7 +192,7 @@ client_mixtral = InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
 client_mistral = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 generate_kwargs = dict( max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False )
 
-system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible result and explaination to user.<|im_end|>"
+system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible detailed result and explaination to user.<|im_end|>"
 
 @spaces.GPU(duration=60, queue=False)
 def model_inference( user_prompt, chat_history, web_search):
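As an aside, a minimal sketch of how the generate_kwargs defined in this hunk pair with huggingface_hub's streaming text_generation API; the prompt is a placeholder.

from huggingface_hub import InferenceClient

client_mistral = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
generate_kwargs = dict(max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False)

# With stream=True and details=True, text_generation yields token-level chunks.
for chunk in client_mistral.text_generation("Hello!", **generate_kwargs):
    if not chunk.token.special:
        print(chunk.token.text, end="", flush=True)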
@@ -256,7 +249,7 @@ def model_inference( user_prompt, chat_history, web_search):
     image_extensions = tuple([ex for ex, f in image_extensions.items()])
 
     if image.endswith(video_extensions):
-        image = sample_frames(image, 12)
+        image = sample_frames(image)
         image_tokens = "<image>" * int(len(image))
         prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
 
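Tying the two changed call sites together, a hedged sketch of the video path after this commit, reusing the processor and sample_frames sketches above. It assumes the signature fix noted earlier, and that system_llava is prepended to the final prompt elsewhere in the file (not shown in this diff); the clip name and question are placeholders.

frames = sample_frames("clip.mp4")      # now always samples 12 frames
image_tokens = "<image>" * len(frames)  # one placeholder token per frame
prompt = f"<|im_start|>user {image_tokens}\nWhat happens in this video?<|im_end|><|im_start|>assistant"
inputs = processor(text=system_llava + prompt, images=frames, return_tensors="pt").to("cuda", torch.float16)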