"""Gradio demo: ask questions about an uploaded image with Qwen2-VL-2B on CPU."""

import contextlib
import os
import tempfile
from datetime import datetime
from typing import Optional

import gradio as gr
import numpy as np
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
MAX_NEW_TOKENS = 128

DESCRIPTION = "[Qwen2-VL-2B Demo (CPU Version)](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)"


def array_to_image_path(image_array: np.ndarray) -> str:
    """Save an image array as a PNG file and return its absolute path.

    The file is created with ``tempfile.mkstemp`` so that two calls made
    within the same second can never overwrite each other's image (a plain
    second-resolution timestamp is not unique under concurrent requests).

    Args:
        image_array: Pixel data accepted by ``PIL.Image.fromarray`` after
            being cast to ``uint8``.

    Returns:
        Absolute path of the PNG file that was written. The caller owns the
        file and is responsible for deleting it.
    """
    # Build the image first so an invalid array does not leave an empty file.
    img = Image.fromarray(np.uint8(image_array))
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    fd, path = tempfile.mkstemp(prefix=f"image_{timestamp}_", suffix=".png")
    os.close(fd)  # PIL reopens the path itself; only the unique name is needed.
    img.save(path)
    return os.path.abspath(path)


# Load model and processor once at import time; both are shared by all requests.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    device_map="cpu",
).eval()
processor = AutoProcessor.from_pretrained(MODEL_ID)


def run_example(image: Optional[np.ndarray], text_input: str) -> str:
    """Answer ``text_input`` about ``image`` and return the generated text.

    Args:
        image: Image array delivered by the ``gr.Image`` component, or
            ``None`` when the user submitted without uploading a picture.
        text_input: The user's question.

    Returns:
        The decoded model response for the single request.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    image_path = array_to_image_path(image)
    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": text_input},
                ],
            }
        ]

        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        # Inference: generation of the output
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

        # generate() returns prompt + completion; keep only the new tokens.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return output_text[0]
    finally:
        # The saved image is only needed for this request; remove it so files
        # do not accumulate. Best effort: a failed delete must not mask the
        # real result or error.
        with contextlib.suppress(OSError):
            os.remove(image_path)


# NOTE: no component below sets elem_id="output", so this rule currently
# matches nothing.
css = """
  #output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Qwen2-VL-2B Input (CPU)"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn.click(run_example, [input_img, text_input], [output_text])

commandline_args = os.getenv("COMMANDLINE_ARGS", "")

# Enable or disable queue based on commandline_args
if "--no-gradio-queue" not in commandline_args:
    demo.queue(api_open=False)

if __name__ == "__main__":
    demo.launch(
        inline=False,
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        debug=True,
    )