# 🔉 EchoSense Image to Audio Playground

import tempfile

import torch
import gradio as gr
from PIL import Image
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration

# Identifier of the pretrained BLIP captioning checkpoint passed to
# `from_pretrained` below.
model = "Salesforce/blip-image-captioning-large"
# Loaded once at import time; `predict` reuses both objects on every call.
# `processor` prepares model inputs and decodes generated token ids to text.
processor = BlipProcessor.from_pretrained(model)
# `head` is the caption-generation model whose `generate` method is invoked
# in `predict`.
head = BlipForConditionalGeneration.from_pretrained(model)

def predict(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The uploaded image as delivered by the Gradio image input, or
            ``None`` when the form was submitted without an image.

    Returns:
        A ``(caption, filepath)`` tuple: the generated caption text and the
        path of an MP3 file containing the spoken caption. ``filepath`` is
        ``None`` when the model produced an empty caption, since there is
        nothing to speak.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an opaque failure
        # from inside the processor.
        raise gr.Error("Please upload an image first.")

    inputs = processor(image, return_tensors="pt")
    output = head.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)

    if not caption.strip():
        # Nothing to synthesize; gTTS is expected to reject empty text.
        return caption, None

    audio = gTTS(caption, lang="en", tld="co.in")
    # Each request gets its own file. A fixed 'caption.mp3' would be
    # overwritten by concurrent requests, so users could receive each
    # other's audio. The handle is closed before saving so the path can be
    # reopened for writing on every platform.
    # NOTE(review): files are created with delete=False and are not removed
    # here — confirm whether the host cleans its temp directory.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        filepath = tmp.name
    audio.save(filepath)
    return caption, filepath

# Input and output widgets for the interface. All of them come from
# `gr.components`: the legacy `gr.inputs` namespace is deprecated, and the
# outputs below already used `gr.components`.
inp = gr.components.Image(label="Upload any Image")
outputs = [
    # Receives the caption string returned by `predict`.
    gr.components.Textbox(type="text", label="Captions"),
    # Receives the path of the generated MP3 returned by `predict`.
    gr.components.Audio(type="filepath", label="audio"),
]

# HTML shown above the demo. It carries the page heading itself, which is why
# the interface is created with an empty `title`.
# The anchor uses `href` (it was misspelled `herf`, which left the link dead).
description = """<div style="text-align: center;">
    <h1>🔉 EchoSense <span style='color: #e6b800;'>Image to Audio</span> Playground</h1>
    <p>This spaces helps generate audio descriptions for input Images</p>
    <p><b>Please note:</b>This space is for demonstration purposes only.</p>
    <p>Visit <a href="https://shreyasdixit.tech">Shreyas Dixit's</a> personal website for more information about the creator.</p>
</div>"""

# Long-form blurb displayed below the demo. Written as adjacent string
# literals (one per sentence) that Python joins into a single line of text.
article = (
    "Echo Sense is an innovative image captioning application that utilizes cutting-edge technology, specifically the powerful Transformer Model Architecture. "
    "This state-of-the-art approach has revolutionized Natural Language Processing (NLP) tasks, including image captioning, making it highly accurate and efficient. "
    "By leveraging pretrained models from Hugging Face and fine-tuning them on the COCO dataset, Echo Sense achieves exceptional performance while significantly reducing the computational cost and training time. "
    "The result is a versatile and reliable solution that not only produces accurate image captions but also generalizes well across various tasks. "
    "Experience the power of Echo Sense and witness firsthand the remarkable capabilities of the Transformer Model Architecture."
)

# Assemble the Gradio app around `predict`.
# The font stack is a theme option, not a `gr.Interface` argument, so it is
# set on the theme object; passed directly to `gr.Interface` it had no effect.
# NOTE(review): the previous theme="grass" is not one of the built-in theme
# names in Gradio releases that ship `gr.themes`, so the default theme appears
# to have been what was rendered — confirm the intended colours.
interface = gr.Interface(
    fn=predict,
    inputs=inp,
    outputs=outputs,
    # Left empty because `description` already renders the page heading.
    title="",
    description=description,
    article=article,
    theme=gr.themes.Default(
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ],
    ),
)
interface.launch()