import base64
import os
import time

import gradio as gr
from together import Together

# Fail fast if the key is missing: the Together client reads
# TOGETHER_API_KEY from the environment.
if "TOGETHER_API_KEY" not in os.environ:
    raise ValueError("Please set the TOGETHER_API_KEY environment variable")

client = Together()

def encode_image(image_path):
    """Return the contents of the file at image_path, base64-encoded."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]
    messages = []

    # Rebuild the conversation in the OpenAI-style chat format the Together
    # API expects. Gradio's tuple-style history stores [user, assistant]
    # pairs; an uploaded image arrives as a (filepath,) tuple in its own
    # turn, with the accompanying text in the following turn.
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            # Image turn: merge the image with the text of the next entry.
            messages.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": history[i + 1][0]},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{encode_image(msg[0][0])}"},
                    },
                ],
            })
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i + 1][1]}]})
        elif i > 0 and isinstance(history[i - 1][0], tuple) and isinstance(msg[0], str):
            # This text turn was already merged into the preceding image turn.
            pass
        elif isinstance(msg[0], str):
            # Plain text exchange.
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

    # Attach an image uploaded with the current message, if any.
    if len(message["files"]) == 1:
        if isinstance(message["files"][0], str):  # plain filepath
            image_path = message["files"][0]
        else:  # dict with a "path" key
            image_path = message["files"][0]["path"]
        messages.append({
            "role": "user",
            "content": [
                {"type": "text", "text": txt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"},
                },
            ],
        })
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

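    # For reference, an image turn in `messages` looks roughly like this
    # (text and base64 payload shortened for illustration):
    #
    #   {"role": "user", "content": [
    #       {"type": "text", "text": "What is in this picture?"},
    #       {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,/9j..."}},
    #   ]}
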
    # Stream the reply so the UI can render it incrementally.
    stream = client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        messages=messages,
        max_tokens=max_new_tokens,
        stream=True,
    )

    buffer = ""
    for chunk in stream:
        # Skip chunks that carry no new content (e.g. a final empty delta).
        if chunk.choices and chunk.choices[0].delta.content is not None:
            buffer += chunk.choices[0].delta.content
            time.sleep(0.01)  # small pause to smooth the visual streaming
            yield buffer


demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Meta Llama 3.2 Vision 11B",
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=500,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        )
    ],
    cache_examples=False,
    description="Try Multimodal Llama by Meta with the Together API in this demo. Upload an image and start chatting about it.",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)


if __name__ == "__main__":
    demo.launch(debug=True)
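
# To try the demo locally (assuming the dependencies are installed, e.g.
# `pip install gradio together`), set the key and run this file:
#
#   export TOGETHER_API_KEY="..."
#   python app.py  # or whatever this script is saved as
#
# Gradio prints a local URL serving the chat interface; upload an image
# and ask a question about it.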