File size: 2,654 Bytes
7efd637
c33dbd2
ca8dc25
c33dbd2
ca8dc25
c33dbd2
 
e98c6cb
c33dbd2
 
 
d107cdf
c33dbd2
 
9dc7fb7
c33dbd2
 
 
 
 
 
 
 
 
 
 
 
 
 
6a8b740
c33dbd2
 
 
 
 
6719d1c
c33dbd2
 
 
 
 
81fc138
c33dbd2
 
 
3807c9a
c33dbd2
 
 
 
82ee039
c33dbd2
 
5e6f5c8
c33dbd2
 
 
 
 
 
 
 
 
 
 
 
4d26ed5
7efd637
c33dbd2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import gradio as gr
import os
from together import Together
from typing import List, Tuple

# Initialize Together client
client = Together()

# Ensure API key is set
if "TOGETHER_API_KEY" not in os.environ:
    raise ValueError("Please set the TOGETHER_API_KEY environment variable")

def call_llama_vision_api(prompt: str, image_url: str) -> str:
    getDescriptionPrompt = "You are a UX/UI designer. Describe the attached screenshot or UI mockup in detail. I will feed in the output you give me to a coding model that will attempt to recreate this mockup, so please think step by step and describe the UI in detail. Pay close attention to background color, text color, font size, font family, padding, margin, border, etc. Match the colors and sizes exactly. Make sure to mention every part of the screenshot including any headers, footers, etc. Use the exact text from the screenshot."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": getDescriptionPrompt + "\n\n" + prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                },
            ],
        }
    ]

    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        messages=messages,
        stream=True,
    )

    response = ""
    for chunk in stream:
        content = chunk.choices[0].delta.content or ""
        response += content
        yield response

def chat(message: str, history: List[Tuple[str, str]], image_url: str) -> Tuple[str, List[Tuple[str, str]]]:
    if not message:
        return "", history

    full_response = ""
    for partial_response in call_llama_vision_api(message, image_url):
        full_response = partial_response
        yield "", history + [(message, full_response)]

    history.append((message, full_response))
    return "", history

# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama 3.2 Vision Chatbot Demo")
    gr.Markdown("Enter your message and an image URL to analyze using the Llama 3.2 Vision model.")
    
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your message")
    image_url = gr.Textbox(label="Image URL", value="https://napkinsdev.s3.us-east-1.amazonaws.com/next-s3-uploads/d96a3145-472d-423a-8b79-bca3ad7978dd/trello-board.png")
    
    clear = gr.Button("Clear")
    
    msg.submit(chat, [msg, chatbot, image_url], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()