akhaliq (HF staff) committed
Commit 5ee7ec4
1 parent: c33dbd2

Update app.py

Files changed (1):
  app.py  +29 −25
app.py CHANGED

```diff
@@ -1,7 +1,9 @@
 import gradio as gr
+from gradio_multimodalchatbot import MultimodalChatbot
+from gradio.data_classes import FileData
 import os
 from together import Together
-from typing import List, Tuple
+import base64
 
 # Initialize Together client
 client = Together()
@@ -10,9 +12,15 @@ client = Together()
 if "TOGETHER_API_KEY" not in os.environ:
     raise ValueError("Please set the TOGETHER_API_KEY environment variable")
 
-def call_llama_vision_api(prompt: str, image_url: str) -> str:
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def call_llama_vision_api(prompt: str, image_path: str) -> str:
     getDescriptionPrompt = "You are a UX/UI designer. Describe the attached screenshot or UI mockup in detail. I will feed in the output you give me to a coding model that will attempt to recreate this mockup, so please think step by step and describe the UI in detail. Pay close attention to background color, text color, font size, font family, padding, margin, border, etc. Match the colors and sizes exactly. Make sure to mention every part of the screenshot including any headers, footers, etc. Use the exact text from the screenshot."
 
+    base64_image = encode_image(image_path)
+
     messages = [
         {
             "role": "user",
@@ -21,7 +29,7 @@ def call_llama_vision_api(prompt: str, image_url: str) -> str:
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url": image_url,
+                        "url": f"data:image/jpeg;base64,{base64_image}"
                     },
                 },
             ],
@@ -38,33 +46,29 @@ def call_llama_vision_api(prompt: str, image_url: str) -> str:
     for chunk in stream:
         content = chunk.choices[0].delta.content or ""
         response += content
-        yield response
+    return response
 
-def chat(message: str, history: List[Tuple[str, str]], image_url: str) -> Tuple[str, List[Tuple[str, str]]]:
-    if not message:
-        return "", history
-
-    full_response = ""
-    for partial_response in call_llama_vision_api(message, image_url):
-        full_response = partial_response
-        yield "", history + [(message, full_response)]
+def chat(message, history):
+    user_message = message["text"]
+    files = message.get("files", [])
+
+    if files and files[0]["file"].path:
+        image_path = files[0]["file"].path
+        response = call_llama_vision_api(user_message, image_path)
+    else:
+        response = "I'm sorry, but I need an image to analyze. Please upload an image along with your question."
 
-    history.append((message, full_response))
-    return "", history
+    return {"text": response, "files": []}
 
-# Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Llama 3.2 Vision Chatbot Demo")
-    gr.Markdown("Enter your message and an image URL to analyze using the Llama 3.2 Vision model.")
+    gr.Markdown("# Llama 3.2 Vision Multimodal Chatbot Demo")
+    gr.Markdown("Upload an image and enter your message to analyze using the Llama 3.2 Vision model.")
 
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(label="Your message")
-    image_url = gr.Textbox(label="Image URL", value="https://napkinsdev.s3.us-east-1.amazonaws.com/next-s3-uploads/d96a3145-472d-423a-8b79-bca3ad7978dd/trello-board.png")
-
-    clear = gr.Button("Clear")
-
-    msg.submit(chat, [msg, chatbot, image_url], [msg, chatbot])
-    clear.click(lambda: None, None, chatbot, queue=False)
+    chatbot = MultimodalChatbot(
+        value=[],
+        height=800,
+        callback=chat,
+    )
 
 if __name__ == "__main__":
     demo.launch()
```
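
The substantive change in this commit is the image transport: instead of forwarding a user-supplied URL to the API, the app now reads the uploaded file and inlines it as a base64 data URL. One caveat: `encode_image` is paired with a hardcoded `data:image/jpeg;...` prefix, so PNG uploads (the old default image was a `.png`) get mislabeled. A minimal sketch of the same pattern that derives the MIME type from the filename instead; `image_to_data_url` is an illustrative name, not part of the commit:

```python
import base64
import mimetypes

def image_to_data_url(image_path: str) -> str:
    # Derive the MIME type from the file extension rather than hardcoding
    # image/jpeg as the commit does; fall back to JPEG when unknown.
    mime, _ = mimetypes.guess_type(image_path)
    mime = mime or "image/jpeg"
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime};base64,{encoded}"
```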
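The `client.chat.completions.create(...)` call sits between hunks, so the request arguments never appear in this diff. A hedged reconstruction of that elided middle of `call_llama_vision_api`, inferred only from the streaming loop that is visible; the model string is an assumption (any Llama 3.2 Vision model served by Together would fit):

```python
    # Assumed reconstruction: the loop below (visible in the diff) implies
    # stream=True and OpenAI-style chunks with choices[0].delta.content.
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",  # assumed; elided from the diff
        messages=messages,
        stream=True,
    )

    response = ""
    for chunk in stream:
        content = chunk.choices[0].delta.content or ""
        response += content
    return response
```

Note the switch from `yield response` to `return response`: the function is no longer a generator, so the UI now receives the full description at once rather than token by token.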
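One wiring detail to watch: `chat` indexes `message["files"][0]["file"].path`, which follows the `gradio_multimodalchatbot` message format, and the component is constructed with a `callback=` argument that stock Gradio components do not accept, so nothing in the visible diff actually connects `chat` to a UI event. If the custom component's API turns out not to support this, an equivalent hookup with plain Gradio would be `gr.ChatInterface(..., multimodal=True)`; a sketch under the assumption of Gradio 4.x, where `files` arrives as a list of local file paths:

```python
import gradio as gr

def chat(message, history):
    # With multimodal=True, Gradio passes {"text": ..., "files": [...]};
    # in Gradio 4.x each entry in "files" is a local file path.
    files = message.get("files", [])
    if not files:
        return "I need an image to analyze. Please upload one with your question."
    return call_llama_vision_api(message["text"], files[0])

demo = gr.ChatInterface(chat, multimodal=True, title="Llama 3.2 Vision Demo")

if __name__ == "__main__":
    demo.launch()
```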