Gradio Demo addition to repo

#2 opened by CoderCowMoo

I've been trying to port over the moondream Gradio demo (Apache 2.0), but after adding streaming I've been getting an error, and I'm absolutely befuddled, since the input hasn't changed.

import torch
import transformers
import gradio as gr
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import warnings

# disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')


# set device
torch.set_default_device('cpu')  # or 'cuda'

# create model
model = AutoModelForCausalLM.from_pretrained(
    'qnguyen3/nanoLLaVA',
    torch_dtype=torch.float16,
    device_map='cpu',
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    'qnguyen3/nanoLLaVA',
    trust_remote_code=True)

def answer_question(img, prompt):
    # nanoLLaVA prompt tokenization stuff
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    print(text)

text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
# -200 is the LLaVA-style image-token placeholder spliced between the two text chunks
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    image_tensor = model.process_images([img], model.config).to(dtype=model.dtype)
    
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs={
            "input_ids": input_ids,
            "images": image_tensor,
            "max_new_tokens": 2048,
            "use_cache": True,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        print(new_text)
        text = tokenizer.decode(new_text[input_ids.shape[1]:], skip_special_tokens=True).strip()
        buffer += text
        yield buffer

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # NanoLLaVA
        ### A tiny vision language model. [GitHub](https://huggingface.co/qnguyen3/nanoLLaVA)
        """
    )
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompt", placeholder="Type here...", scale=4)
        submit = gr.Button("Submit")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        output = gr.TextArea(label="Response")
    submit.click(answer_question, [img, prompt], output)
    prompt.submit(answer_question, [img, prompt], output)

demo.queue().launch(debug=True)

Any help would be appreciated.
I'm also unable to run on CUDA: the image features somehow remain on the CPU even after an explicit image_tensor = image_tensor.to(device), so the script falls back to CPU.
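
For anyone hitting the same wall: TextIteratorStreamer yields chunks of already-decoded text, not token ids, so calling tokenizer.decode on new_text (and slicing it by input_ids.shape[1]) inside the loop is the likely culprit. A minimal sketch of the corrected generator, assuming the same model, tokenizer, and imports as above, with skip_prompt=True so the prompt isn't echoed into the stream:

def answer_question(img, prompt):
    messages = [{"role": "user", "content": f'<image>\n{prompt}'}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    image_tensor = model.process_images([img], model.config).to(dtype=model.dtype)

    # skip_prompt=True keeps the prompt tokens out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs={
            "input_ids": input_ids,
            "images": image_tensor,
            "max_new_tokens": 2048,
            "use_cache": True,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        # new_text is already a decoded string; just accumulate and yield it
        buffer += new_text
        yield buffer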

Very happy to say, I got a working demo for both streaming and non-streaming.

Streaming: https://gist.github.com/CoderCowMoo/735e84e35ca3b68a1125f738bf72f096
Non-streaming: https://gist.github.com/CoderCowMoo/2f84e594f950de1954276a6d031635fa
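
On the CUDA issue mentioned above: one hedged guess is that only the image features were moved while input_ids stayed on the CPU, so the remote-code generate path mixes devices. A minimal sketch of explicit device placement, assuming device_map and Tensor.to behave as usual for this model:

device = 'cuda'  # assumes a CUDA-capable GPU is available
model = AutoModelForCausalLM.from_pretrained(
    'qnguyen3/nanoLLaVA',
    torch_dtype=torch.float16,
    device_map=device,
    trust_remote_code=True)

# inside answer_question: move *both* tensors, not just the image features
input_ids = input_ids.to(device)
image_tensor = model.process_images([img], model.config).to(dtype=model.dtype, device=device)

If the remote code still initializes its vision tower on the CPU, a blunter fallback is calling model.to(device) once after loading.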
