image.to.txt / requirements.txt
123LETSPLAY's picture
Create requirements.txt
f22cbe7 verified
raw
history blame
784 Bytes
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
# Caption a single image with a pre-trained ViT encoder + GPT-2 decoder.
#
# The model, image processor and tokenizer must all come from the same
# checkpoint, so the checkpoint name is defined once and reused.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load an image
image_path = "path/to/your/image.jpg"  # Update with your image path
# Image.open is lazy and keeps the file handle open; the context manager
# closes it once the pixel data has been copied out by convert().
with Image.open(image_path) as source_image:
    # Force three-channel RGB so grayscale, palette and RGBA files are
    # accepted by the image processor as well.
    image = source_image.convert("RGB")

# Process the image into the tensor batch the encoder consumes
pixel_values = processor(images=image, return_tensors="pt").pixel_values

# Generate caption token ids, then decode the first (only) sequence in the batch
output_ids = model.generate(pixel_values)
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated caption (this checkpoint describes the image; it is not OCR)
print(text)