"""Streamlit app that generates a text caption for an uploaded image.

The caption is produced by the "nlpconnect/vit-gpt2-image-captioning"
vision-encoder-decoder checkpoint from Hugging Face ``transformers``.
"""

import streamlit as st
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"


@st.cache_resource
def _load_captioning_components():
    """Load the captioning model, image processor and tokenizer once.

    Streamlit re-executes this script on every user interaction, so the
    loader is wrapped in ``st.cache_resource`` to avoid re-instantiating
    the pre-trained objects on each rerun.

    Returns:
        A ``(model, processor, tokenizer)`` tuple, all loaded from
        ``MODEL_NAME``.
    """
    loaded_model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
    loaded_processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return loaded_model, loaded_processor, loaded_tokenizer


# Keep the original module-level names so the rest of the script is unchanged.
model, processor, tokenizer = _load_captioning_components()

# Streamlit app title
st.title("Image to Text App")

# File uploader
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    try:
        # Image.open is lazy; convert() forces the actual decode, so both
        # calls live in the same try block to catch corrupt/truncated files.
        image = Image.open(uploaded_file)
        # PNG uploads may be RGBA, palette or grayscale; the processor is
        # fed a three-channel image (the checkpoint's model card does the
        # same conversion before preprocessing).
        rgb_image = image.convert("RGB")
    except OSError:
        # PIL signals unreadable image data with OSError (or a subclass).
        st.error("The uploaded file could not be read as an image.")
    else:
        # Display the image exactly as uploaded.
        st.image(image, caption='Uploaded Image', use_column_width=True)

        # Process the image into the tensor the encoder expects.
        pixel_values = processor(images=rgb_image, return_tensors="pt").pixel_values

        # Generate token ids and decode the first sequence into text.
        output_ids = model.generate(pixel_values)
        text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Display the generated text
        st.write("Generated Text:")
        st.write(text)