KingNish committed
Commit 5b0f841 (1 parent: 65e2803)

Update app.py

Files changed (1):
app.py (+307 -157)
app.py CHANGED
@@ -2,18 +2,19 @@ import os
 import subprocess
 import random
 
-# Install flash attention
+# Install flash attention, skipping CUDA build if necessary
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
-
+import requests
+from bs4 import BeautifulSoup
+# Import necessary libraries
 import copy
 import spaces
 import time
 import torch
-
 from threading import Thread
 from typing import List, Dict, Union
 import urllib
@@ -33,14 +34,18 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import AutoModel
 from transformers import AutoProcessor
 
+# Load pre-trained models for image captioning and language modeling
 model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
 processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
 
+# Define a function for image captioning
 @spaces.GPU(queue=False)
 def videochat(image3, prompt3):
+    # Process input image and prompt
     inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
+    # Generate captions
     with torch.inference_mode():
-        output = model3.generate(
+        output = model3.generate(
             **inputs,
             do_sample=False,
             use_cache=True,
@@ -48,40 +53,47 @@ def videochat(image3, prompt3):
             eos_token_id=151645,
             pad_token_id=processor.tokenizer.pad_token_id
         )
-
-    prompt_len = inputs["input_ids"].shape[1]
+    prompt_len = inputs["input_ids"].shape[1]
+    # Decode and return the generated captions
     decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
     if decoded_text.endswith("<|im_end|>"):
         decoded_text = decoded_text[:-10]
     yield decoded_text
 
+# Define Gradio theme
 theme = gr.themes.Soft(
     primary_hue="blue",
     secondary_hue="orange",
     neutral_hue="gray",
-    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif']).set(
-    body_background_fill_dark="#111111",
-    block_background_fill_dark="#111111",
-    block_border_width="1px",
-    block_title_background_fill_dark="#1e1c26",
-    input_background_fill_dark="#292733",
-    button_secondary_background_fill_dark="#24212b",
-    border_color_primary_dark="#343140",
-    background_fill_secondary_dark="#111111",
-    color_accent_soft_dark="transparent")
+    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif']
+).set(
+    body_background_fill_dark="#111111",
+    block_background_fill_dark="#111111",
+    block_border_width="1px",
+    block_title_background_fill_dark="#1e1c26",
+    input_background_fill_dark="#292733",
+    button_secondary_background_fill_dark="#24212b",
+    border_color_primary_dark="#343140",
+    background_fill_secondary_dark="#111111",
+    color_accent_soft_dark="transparent"
+)
 
+# Set default language for speech recognition
 default_lang = "en"
-
-engines = { default_lang: nemo(default_lang) }
-
+# Initialize speech recognition engine
+engines = {default_lang: nemo(default_lang)}
+
+# Define a function for speech-to-text transcription
 def transcribe(audio):
     lang = "en"
     model = engines[lang]
     text = model.stt_file(audio)[0]
     return text
 
+# Get Hugging Face API token
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
+# Define a function to get the appropriate InferenceClient based on model name
 def client_fn(model):
     if "Nous" in model:
         return InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
@@ -93,46 +105,50 @@ def client_fn(model):
         return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
     elif "Zephyr" in model:
         return InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-    else:
+    else:
         return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 
+# Define a function to generate a random seed
 def randomize_seed_fn(seed: int) -> int:
     seed = random.randint(0, 999999)
     return seed
 
+# System instructions for the language model
 system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
 
+# Define a function for language modeling
 def models(text, model="Mixtral 8x7B", seed=42):
-
     seed = int(randomize_seed_fn(seed))
-    generator = torch.Generator().manual_seed(seed)
-
+    generator = torch.Generator().manual_seed(seed)
     client = client_fn(model)
     generate_kwargs = dict(
         max_new_tokens=512,
         seed=seed,
     )
-
     formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
     stream = client.text_generation(
-        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
+    )
     output = ""
     for response in stream:
         if not response.token.text == "</s>":
             output += response.token.text
-
     return output
 
+# Define an asynchronous function to handle voice input and generate responses
 async def respond(audio, model, seed):
     user = transcribe(audio)
     reply = models(user, model, seed)
     communicate = edge_tts.Communicate(reply)
+    # Save the generated speech to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
     yield tmp_path
 
+# Set device to CUDA if available, otherwise CPU
 DEVICE = torch.device("cuda")
+# Load pre-trained models for image-based chat
 MODELS = {
     "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
         "HuggingFaceM4/idefics2-8b-chatty",
@@ -140,28 +156,20 @@ MODELS = {
         _attn_implementation="flash_attention_2",
     ).to(DEVICE),
 }
+# Load pre-trained processor for image-based chat
 PROCESSOR = AutoProcessor.from_pretrained(
     "HuggingFaceM4/idefics2-8b",
 )
 
+# Define system prompt for the image-based chat model
 SYSTEM_PROMPT = [
     {
         "role": "system",
         "content": [
             {
                 "type": "text",
-                "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include:
-- **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information.
-- **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals:
-> ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random})
-For image generation, I replace {info inside curly braces} with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience.
-For instance, if the User requests:
-[USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars.
-[OpenGPT 4o] Generating Image you requested:
-![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172)
-**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.
-Note: Make sure to always provide image links starting with ! .As given in examples.
-My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question.""" },
+                "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include: - **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information. - **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals: > ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random})For image generation, I replace {info inside curly braces} with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. For instance, if the User requests: [USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars. [OpenGPT 4o] Generating Image you requested: ![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172)**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.Note: Make sure to always provide image links starting with ! .As given in examples. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question."""
+            },
         ],
     },
     {
@@ -174,7 +182,7 @@ My ultimate goal is to offer a seamless and enjoyable experience, providing assi
         ],
     }
 ]
-
+# Path to example images
 examples_path = os.path.dirname(__file__)
 EXAMPLES = [
     [
@@ -196,7 +204,8 @@ EXAMPLES = [
     [
         {
             "text": "Identify two famous people in the modern world.",
-            "files": [f"{examples_path}/example_images/elon_smoking.jpg", f"{examples_path}/example_images/steve_jobs.jpg",]
+            "files": [f"{examples_path}/example_images/elon_smoking.jpg",
+                      f"{examples_path}/example_images/steve_jobs.jpg", ]
         }
     ],
     [
@@ -226,7 +235,7 @@ EXAMPLES = [
             "text": "What is formed by the deposition of the weathered remains of other rocks?",
             "files": [f"{examples_path}/example_images/ai2d_example.jpeg"],
         }
-    ],
+    ],
     [
         {
             "text": "What's unusual about this image?",
@@ -235,14 +244,16 @@
     ],
 ]
 
+# Set bot avatar image
 BOT_AVATAR = "OpenAI_logo.png"
 
+# Chatbot utility functions
 
-# Chatbot utils
+# Check if a turn in the chat history only contains media
 def turn_is_pure_media(turn):
     return turn[1] is None
 
-
+# Load image from URL
 def load_image_from_url(url):
     with urllib.request.urlopen(url) as response:
         image_data = response.read()
@@ -250,7 +261,7 @@ def load_image_from_url(url):
     image = PIL.Image.open(image_stream)
     return image
 
-
+# Convert image to bytes
 def img_to_bytes(image_path):
     image = PIL.Image.open(image_path).convert(mode='RGB')
     buffer = io.BytesIO()
@@ -259,9 +270,9 @@
     image.close()
     return img_bytes
 
-
+# Format user prompt with image history and system conditioning
 def format_user_prompt_with_im_history_and_system_conditioning(
-    user_prompt, chat_history
+        user_prompt, chat_history
 ) -> List[Dict[str, Union[List, str]]]:
     """
     Produce the resulting list that needs to go inside the processor. It handles the potential image(s), the history, and the system conditioning.
@@ -273,7 +284,6 @@ def format_user_prompt_with_im_history_and_system_conditioning(
         for content in resulting_message["content"]:
             if content["type"] == "image":
                 resulting_images.append(load_image_from_url(content["image"]))
-
     # Format history
     for turn in chat_history:
         if not resulting_messages or (
@@ -285,7 +295,6 @@
                     "content": [],
                 }
             )
-
         if turn_is_pure_media(turn):
             media = turn[0][0]
             resulting_messages[-1]["content"].append({"type": "image"})
@@ -301,7 +310,6 @@
                     "content": [{"type": "text", "text": user_utterance.strip()}],
                 }
             )
-
     # Format current input
     if not user_prompt["files"]:
         resulting_messages.append(
@@ -311,19 +319,18 @@
             }
         )
     else:
-        # Choosing to put the image first (i.e. before the text), but this is an arbiratrary choice.
+        # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
         resulting_messages.append(
             {
                 "role": "user",
                 "content": [{"type": "image"}] * len(user_prompt["files"])
-                + [{"type": "text", "text": user_prompt["text"]}],
+                           + [{"type": "text", "text": user_prompt["text"]}],
             }
         )
         resulting_images.extend([PIL.Image.open(path) for path in user_prompt["files"]])
-
     return resulting_messages, resulting_images
 
-
+# Extract images from a list of messages
 def extract_images_from_msg_list(msg_list):
     all_images = []
     for msg in msg_list:
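
Reviewer note: a sketch of the data shapes format_user_prompt_with_im_history_and_system_conditioning() consumes and produces, with hypothetical values ("cat.png" is a placeholder path, not a file in the repo):

    user_prompt = {"text": "What is in this image?", "files": ["cat.png"]}
    chat_history = [("Hello", "Hi! How can I help you today?")]
    messages, images = format_user_prompt_with_im_history_and_system_conditioning(
        user_prompt=user_prompt,
        chat_history=chat_history,
    )
    # messages: system conditioning + history turns + the current user turn,
    # with one {"type": "image"} placeholder per attached file;
    # images: the PIL images those placeholders refer to, in order.
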
@@ -332,80 +339,221 @@ def extract_images_from_msg_list(msg_list):
                 all_images.append(c_)
     return all_images
 
+# List of user agents for web search
+_useragent_list = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
+]
 
+# Get a random user agent from the list
+def get_useragent():
+    """Returns a random user agent from the list."""
+    return random.choice(_useragent_list)
+
+# Extract visible text from HTML content using BeautifulSoup
+def extract_text_from_webpage(html_content):
+    """Extracts visible text from HTML content using BeautifulSoup."""
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Remove unwanted tags
+    for tag in soup(["script", "style", "header", "footer", "nav"]):
+        tag.extract()
+    # Get the remaining visible text
+    visible_text = soup.get_text(strip=True)
+    return visible_text
+
+# Perform a Google search and return the results
+def search(term, num_results=3, lang="en", advanced=True, sleep_interval=0, timeout=5, safe="active", ssl_verify=None):
+    """Performs a Google search and returns the results."""
+    # Ensure term is a string before parsing
+    if isinstance(term, dict):
+        term = term.get('text', '')  # Get text from user_prompt or default to empty string
+    escaped_term = urllib.parse.quote_plus(term)
+    start = 0
+    all_results = []
+    # Fetch results in batches
+    while start < num_results:
+        resp = requests.get(
+            url="https://www.google.com/search",
+            headers={"User-Agent": get_useragent()},  # Set random user agent
+            params={
+                "q": term,
+                "num": num_results - start,  # Number of results to fetch in this batch
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+            },
+            timeout=timeout,
+            verify=ssl_verify,
+        )
+        resp.raise_for_status()  # Raise an exception if request fails
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", attrs={"class": "g"})
+        # If no results, continue to the next batch
+        if not result_block:
+            start += 1
+            continue
+        # Extract link and text from each result
+        for result in result_block:
+            link = result.find("a", href=True)
+            if link:
+                link = link["href"]
+                try:
+                    # Fetch webpage content
+                    webpage = requests.get(link, headers={"User-Agent": get_useragent()})
+                    webpage.raise_for_status()
+                    # Extract visible text from webpage
+                    visible_text = extract_text_from_webpage(webpage.text)
+                    all_results.append({"link": link, "text": visible_text})
+                except requests.exceptions.RequestException as e:
+                    # Handle errors fetching or processing webpage
+                    print(f"Error fetching or processing {link}: {e}")
+                    all_results.append({"link": link, "text": None})
+            else:
+                all_results.append({"link": None, "text": None})
+        start += len(result_block)  # Update starting index for next batch
+    return all_results
+
+# Format the prompt for the language model
+def format_prompt(user_prompt, chat_history):
+    prompt = "<s>"
+    for item in chat_history:
+        if isinstance(item, tuple):  # Check if it's a text turn
+            prompt += f"[INST] {item[0]} [/INST]"
+            prompt += f" {item[1]}</s> "
+        elif isinstance(item, str):  # Check if it's an image path
+            prompt += f"[INST] <image> [/INST] </s> "  # Placeholder for image turns
+        else:
+            print(f"Unexpected type in chat_history: {type(item)}")  # Debug output
+    prompt += f"[INST] {user_prompt} [/INST]"
+    return prompt
+
+# Define a function for model inference
 @spaces.GPU(duration=30, queue=False)
 def model_inference(
-    user_prompt,
-    chat_history,
-    model_selector,
-    decoding_strategy,
-    temperature,
-    max_new_tokens,
-    repetition_penalty,
-    top_p,
+        user_prompt,
+        chat_history,
+        model_selector,
+        decoding_strategy,
+        temperature,
+        max_new_tokens,
+        repetition_penalty,
+        top_p,
+        web_search,
 ):
-    if user_prompt["text"].strip() == "" and not user_prompt["files"]:
-        gr.Error("Please input a query and optionally an image(s).")
-
-    if user_prompt["text"].strip() == "" and user_prompt["files"]:
-        gr.Error("Please input a text query along with the image(s).")
-
-    streamer = TextIteratorStreamer(
-        PROCESSOR.tokenizer,
-        skip_prompt=True,
-        timeout=120.0,
-    )
-
-    generation_args = {
-        "max_new_tokens": max_new_tokens,
-        "repetition_penalty": repetition_penalty,
-        "streamer": streamer,
-    }
-
-    assert decoding_strategy in [
-        "Greedy",
-        "Top P Sampling",
-    ]
-    if decoding_strategy == "Greedy":
-        generation_args["do_sample"] = False
-    elif decoding_strategy == "Top P Sampling":
-        generation_args["temperature"] = temperature
-        generation_args["do_sample"] = True
-        generation_args["top_p"] = top_p
-
-    # Creating model inputs
-    (
-        resulting_text,
-        resulting_images,
-    ) = format_user_prompt_with_im_history_and_system_conditioning(
-        user_prompt=user_prompt,
-        chat_history=chat_history,
-    )
-    prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
-    inputs = PROCESSOR(
-        text=prompt,
-        images=resulting_images if resulting_images else None,
-        return_tensors="pt",
-    )
-    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-    generation_args.update(inputs)
-
-    thread = Thread(
-        target=MODELS[model_selector].generate,
-        kwargs=generation_args,
-    )
-    thread.start()
-
-    print("Start generating")
-    acc_text = ""
-    for text_token in streamer:
-        time.sleep(0.01)
-        acc_text += text_token
-        if acc_text.endswith("<end_of_utterance>"):
-            acc_text = acc_text[:-18]
-        yield acc_text
-
+    # Define generation_args at the beginning of the function
+    generation_args = {}
 
+    # Web search logic
+    if not user_prompt["files"]:
+        if web_search is True:
+            """Performs a web search, feeds the results to a language model, and returns the answer."""
+            web_results = search(user_prompt["text"])
+            web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
+            # Load the language model
+            client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
+            generate_kwargs = dict(
+                max_new_tokens=4000,
+                do_sample=True,
+            )
+            # Format the prompt for the language model
+            formatted_prompt = format_prompt(
+                f"""You are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, You are provided with WEB info from which you can find informations to answer. You do not say Unnecesarry things Only say thing which is important and relevant. You also has the ability to generate images. BY Utilizing the following link structure, : ![](https://image.pollinations.ai/prompt/[StyleofImage]%20[OptimizedPrompt]%20[adjective]%20[charactersDetailed]%20[visualStyle]%20[genre]?width=[width]&height=[height]&nologo=poll&nofeed=yes&seed=[random]) For image generation, You replace info inside bracets with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. Note: 1. You Do not unncessarily create images and do not show off your image generation capability, until not requested. 2. You always Give image link in format ![](url) 3. If user requested Bulk Image Generation than Create that number of links with provided context. 4. Always learn from previous conversation. 5. Always try to connect conversation with history. 6. Do not make conversation too long. 7. Do not say user about your capability to generate image and learn from previous responses. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question. [USER] {user_prompt} [WEB] {web2} [OpenGPT 4o]""",
+                chat_history)
+            # Generate the response from the language model
+            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
+                                            return_full_text=False)
+            output = ""
+            # Construct the output from the stream of tokens
+            for response in stream:
+                if not response.token.text == "</s>":
+                    output += response.token.text
+                    yield output
+        else:
+            client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
+            generate_kwargs = dict(
+                max_new_tokens=5000,
+                do_sample=True,
+            )
+            # Format the prompt for the language model
+            formatted_prompt = format_prompt(
+                f"""You are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, You do not say Unnecesarry things Only say thing which is important and relevant. You also has the ability to generate images. BY Utilizing the following link structure, : ![](https://image.pollinations.ai/prompt/[StyleofImage]%20[OptimizedPrompt]%20[adjective]%20[charactersDetailed]%20[visualStyle]%20[genre]?width=[width]&height=[height]&nologo=poll&nofeed=yes&seed=[random]) For image generation, You replace info inside bracets with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. Note: 1. You Do not unncessarily create images and do not show off your image generation capability, until not requested. 2. You always Give image link in format ![](url) 3. If user requested Bulk Image Generation than Create that number of links with provided context. 4. Always learn from previous conversation. 5. Always try to connect conversation with history. 6. Do not make conversation too long. 7. Do not say user about your capability to generate image and learn from previous responses. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question. [USER] {user_prompt} [OpenGPT 4o]""",
+                chat_history)
+            # Generate the response from the language model
+            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
+                                            return_full_text=False)
+            output = ""
+            # Construct the output from the stream of tokens
+            for response in stream:
+                if not response.token.text == "</s>":
+                    output += response.token.text
+                    yield output
+        return
+    else:
+        if user_prompt["text"].strip() == "" and not user_prompt["files"]:
+            gr.Error("Please input a query and optionally an image(s).")
+            return  # Stop execution if there's an error
+
+        if user_prompt["text"].strip() == "" and user_prompt["files"]:
+            gr.Error("Please input a text query along with the image(s).")
+            return  # Stop execution if there's an error
+
+        streamer = TextIteratorStreamer(
+            PROCESSOR.tokenizer,
+            skip_prompt=True,
+            timeout=120.0,
+        )
+        # Move generation_args initialization here
+        generation_args = {
+            "max_new_tokens": max_new_tokens,
+            "repetition_penalty": repetition_penalty,
+            "streamer": streamer,
+        }
+        assert decoding_strategy in [
+            "Greedy",
+            "Top P Sampling",
+        ]
+
+        if decoding_strategy == "Greedy":
+            generation_args["do_sample"] = False
+        elif decoding_strategy == "Top P Sampling":
+            generation_args["temperature"] = temperature
+            generation_args["do_sample"] = True
+            generation_args["top_p"] = top_p
+        # Creating model inputs
+        (
+            resulting_text,
+            resulting_images,
+        ) = format_user_prompt_with_im_history_and_system_conditioning(
+            user_prompt=user_prompt,
+            chat_history=chat_history,
+        )
+        prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
+        inputs = PROCESSOR(
+            text=prompt,
+            images=resulting_images if resulting_images else None,
+            return_tensors="pt",
+        )
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        generation_args.update(inputs)
+        thread = Thread(
+            target=MODELS[model_selector].generate,
+            kwargs=generation_args,
+        )
+        thread.start()
+        acc_text = ""
+        for text_token in streamer:
+            time.sleep(0.01)
+            acc_text += text_token
+            if acc_text.endswith("<end_of_utterance>"):
+                acc_text = acc_text[:-18]
+            yield acc_text
+        return
+# Define features for the dataset
 FEATURES = datasets.Features(
     {
         "model_selector": datasets.Value("string"),
@@ -416,11 +564,10 @@
         "max_new_tokens": datasets.Value("int32"),
         "repetition_penalty": datasets.Value("float32"),
         "top_p": datasets.Value("int32"),
-    }
-)
-
+    }
+)
 
-# Hyper-parameters for generation
+# Define hyper-parameters for generation
 max_new_tokens = gr.Slider(
     minimum=2048,
     maximum=16000,
@@ -469,22 +616,23 @@ top_p = gr.Slider(
     info="Higher values are equivalent to sampling more low-probability tokens.",
 )
 
-
+# Create a chatbot interface
 chatbot = gr.Chatbot(
     label="OpnGPT-4o-Chatty",
     avatar_images=[None, BOT_AVATAR],
-    show_copy_button=True,
-    likeable=True,
+    show_copy_button=True,
+    likeable=True,
     layout="panel"
 )
-
-output=gr.Textbox(label="Prompt")
+output = gr.Textbox(label="Prompt")
 
+# Create Gradio blocks for different functionalities
+
+# Chat interface block
 with gr.Blocks(
-    fill_height=True,
-    css=""".gradio-container .avatar-container {height: 40px width: 40px !important;} #duplicate-button {margin: auto; color: white; background: #f1a139; border-radius: 100vh; margin-top: 2px; margin-bottom: 2px;}""",
+        fill_height=True,
+        css=""".gradio-container .avatar-container {height: 40px width: 40px !important;} #duplicate-button {margin: auto; color: white; background: #f1a139; border-radius: 100vh; margin-top: 2px; margin-bottom: 2px;}""",
 ) as chat:
-
     gr.Markdown("# Image Chat, Image Generation, Image classification and Normal Chat")
     with gr.Row(elem_id="model_selector_row"):
         model_selector = gr.Dropdown(
@@ -496,17 +644,16 @@ with gr.Blocks(
             label="Model",
             visible=False,
         )
-
    decoding_strategy.change(
         fn=lambda selection: gr.Slider(
             visible=(
-                selection
-                in [
-                    "contrastive_sampling",
-                    "beam_sampling",
-                    "Top P Sampling",
-                    "sampling_top_k",
-                ]
+                    selection
+                    in [
+                "contrastive_sampling",
+                "beam_sampling",
+                "Top P Sampling",
+                "sampling_top_k",
+            ]
             )
         ),
         inputs=decoding_strategy,
@@ -517,7 +664,6 @@
         inputs=decoding_strategy,
         outputs=top_p,
     )
-
     gr.ChatInterface(
         fn=model_inference,
         chatbot=chatbot,
@@ -531,30 +677,34 @@
             max_new_tokens,
             repetition_penalty,
             top_p,
-        ],
+            gr.Checkbox(label="Web Search", value=True),  # Add web_search checkbox
+        ],
     )
 
-with gr.Blocks() as voice:
-    with gr.Row():
-        select = gr.Dropdown([ 'Nous Hermes Mixtral 8x7B DPO', 'Mixtral 8x7B','StarChat2 15b','Mistral 7B v0.3','Phi 3 mini', 'Zephyr 7b' ], value="Mistral 7B v0.3", label="Select Model")
+# Voice chat block
+with gr.Blocks() as voice:
+    with gr.Row():
+        select = gr.Dropdown(['Nous Hermes Mixtral 8x7B DPO', 'Mixtral 8x7B', 'StarChat2 15b', 'Mistral 7B v0.3',
+                              'Phi 3 mini', 'Zephyr 7b'], value="Mistral 7B v0.3", label="Select Model")
         seed = gr.Slider(
-            label="Seed",
-            minimum=0,
-            maximum=999999,
-            step=1,
-            value=0,
-            visible=False
+                label="Seed",
+                minimum=0,
+                maximum=999999,
+                step=1,
+                value=0,
+                visible=False
         )
         input = gr.Audio(label="User", sources="microphone", type="filepath", waveform_options=False)
         output = gr.Audio(label="AI", type="filepath",
-            interactive=False,
-            autoplay=True,
-            elem_classes="audio")
+                          interactive=False,
+                          autoplay=True,
+                          elem_classes="audio")
         gr.Interface(
-            fn=respond,
-            inputs=[input, select,seed],
-            outputs=[output], api_name="translate", live=True)
+            fn=respond,
+            inputs=[input, select, seed],
+            outputs=[output], api_name="translate", live=True)
 
+# Live chat block
 with gr.Blocks() as livechat:
     gr.Interface(
         fn=videochat,
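
Reviewer note: for reference, what the new format_prompt() helper produces for a one-turn history (Mistral-style [INST] tags; the history values below are illustrative):

    history = [("What is flash attention?", "A memory-efficient attention algorithm.")]
    print(format_prompt("Summarize that in one line.", history))
    # <s>[INST] What is flash attention? [/INST] A memory-efficient attention algorithm.</s> [INST] Summarize that in one line. [/INST]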