import torch
import gradio as gr
from PIL import Image
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration
# Hugging Face Hub id of the BLIP captioning checkpoint; the processor
# (image preprocessing + token decoding) and the generation model are both
# loaded from this same checkpoint, once, at import time.
model = "Salesforce/blip-image-captioning-large"
processor, head = (
    BlipProcessor.from_pretrained(model),
    BlipForConditionalGeneration.from_pretrained(model),
)
def predict(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The image supplied by the Gradio image input. ``None`` when
            the user submits the form without uploading anything.

    Returns:
        A ``(caption, filepath)`` tuple: the generated caption text and the
        path of an MP3 file containing the spoken caption. ``filepath`` is
        ``None`` when the model produced an empty caption, because there is
        nothing to speak.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable
            message instead of a stack trace from the processor.
    """
    import tempfile

    if image is None:
        raise gr.Error("Please upload an image first.")

    inputs = processor(image, return_tensors="pt")
    output = head.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # gTTS refuses empty text; return the (empty) caption without audio
    # rather than failing the whole request.
    if not caption.strip():
        return caption, None

    audio = gTTS(caption, lang="en", tld="co.in")

    # Write each result to its own temp file. A single fixed 'caption.mp3'
    # is shared by every request, so concurrent users would overwrite (and
    # be served) each other's audio. The handle is closed before saving so
    # gTTS can reopen the path on every platform.
    # NOTE: the file is intentionally not deleted here, because the caller
    # still has to read it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        filepath = tmp.name
    audio.save(filepath)

    return caption, filepath
# Input widget: a single image upload. Uses gr.components (like the outputs
# below) instead of the deprecated gr.inputs namespace, which newer Gradio
# releases no longer provide.
inp = gr.components.Image(label="Upload any Image")
# Output widgets, in the same order as the (caption, filepath) tuple that
# predict() returns: the caption text, then the audio file to play back.
outputs = [
    gr.components.Textbox(type="text", label="Captions"),
    gr.components.Audio(type="filepath", label="audio"),
]
# Text shown under the (empty) title of the demo page. The value starts and
# ends with a newline, with one line of text per list entry in between.
description = "\n".join(
    [
        "",
        "🔉 EchoSense Image to Audio Playground",
        "This spaces helps generate audio descriptions for input Images",
        "Please note:This space is for demonstration purposes only.",
        "Visit Shreyas Dixit's personal website for more information about the creator.",
        "",
    ]
)

# Longer blurb passed to the interface as its article. Adjacent string
# literals are concatenated by the parser, one sentence per literal, into a
# single-line string.
article = (
    "Echo Sense is an innovative image captioning application that utilizes cutting-edge technology, specifically the powerful Transformer Model Architecture. "
    "This state-of-the-art approach has revolutionized Natural Language Processing (NLP) tasks, including image captioning, making it highly accurate and efficient. "
    "By leveraging pretrained models from Hugging Face and fine-tuning them on the COCO dataset, Echo Sense achieves exceptional performance while significantly reducing the computational cost and training time. "
    "The result is a versatile and reliable solution that not only produces accurate image captions but also generalizes well across various tasks. "
    "Experience the power of Echo Sense and witness firsthand the remarkable capabilities of the Transformer Model Architecture."
)
# Assemble the demo UI: predict() is wired between the image input and the
# caption/audio outputs, with the description and article shown on the page.
interface = gr.Interface(
    fn=predict,
    inputs=inp,
    outputs=outputs,
    title="",
    description=description,
    article=article,
    # Fonts are a property of the theme, not of gr.Interface: a top-level
    # `font=` keyword is not an Interface parameter, and "grass" is not a
    # current built-in theme name. Configure the font stack on a theme
    # object so Open Sans is actually applied, with generic fallbacks.
    theme=gr.themes.Default(
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ],
    ),
)
# Start the web server; this runs at import time, as the script is the app
# entry point.
interface.launch()