import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Load the fine-tuned Florence-2 DocVQA model and its processor.
# trust_remote_code=True is required because Florence-2 ships custom modeling code.
model = AutoModelForCausalLM.from_pretrained("mynkchaudhry/Florence-2-FT-DocVQA", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("mynkchaudhry/Florence-2-FT-DocVQA", trust_remote_code=True)

# Move the model to the GPU once at startup rather than on every request.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_response(image, question):
    try:
        # The processor expects an RGB image.
        if image.mode != "RGB":
            image = image.convert("RGB")

        inputs = processor(text=question, images=image, return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Generate an answer; no gradients are needed for inference.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_length=1024,
                num_beams=3,
                early_stopping=True,
            )
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Error processing image: {e}"

# Example images for demonstration (update paths as needed)
examples = [
    ["demo.png", "what is the address in the page?"],
    ["demo2.jpg", "what is the date in the page?"],
    ["demo.png", "what is the name in the page?"],
]

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Response"),
    examples=examples,
    title="Image to Text Extractor",
    description="Upload an image and ask a question. The tool extracts the relevant information from the image based on your question.",
)

# Launch the interface
iface.launch()
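
# Optional: once the app is running, the same endpoint can be queried
# programmatically with gradio_client. This is a minimal sketch, assuming the
# default local URL (http://127.0.0.1:7860) and that demo.png exists locally;
# install the client with `pip install gradio_client`:
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   answer = client.predict(
#       handle_file("demo.png"),
#       "what is the address in the page?",
#       api_name="/predict",  # default endpoint name for a gr.Interface
#   )
#   print(answer)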