"""EchoSense: Gradio demo that captions an uploaded image and speaks the caption."""

import tempfile

import torch
import gradio as gr
from PIL import Image
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration

# Checkpoint shared by the processor and the captioning model.
model = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(model)
head = BlipForConditionalGeneration.from_pretrained(model)


def predict(image):
    """Caption ``image`` and synthesize the caption as an MP3 file.

    Args:
        image: The value delivered by the Gradio image input (presumably a
            NumPy array with the component's default settings -- confirm
            against the installed Gradio version).

    Returns:
        A ``(caption, filepath)`` tuple: the generated caption text and the
        path of the MP3 file containing the spoken caption.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Submitting the form without an upload passes None; fail with a
        # readable message instead of an opaque processor error.
        raise gr.Error("Please upload an image first.")

    inputs = processor(image, return_tensors="pt")
    output = head.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)

    audio = gTTS(caption, lang="en", tld="co.in")

    # Use a unique file per request: a fixed 'caption.mp3' would be
    # overwritten when several requests are served at the same time.
    # The handle is closed before gTTS reopens the path for writing.
    # NOTE: delete=False means these files are not removed automatically.
    with tempfile.NamedTemporaryFile(
        prefix="caption_", suffix=".mp3", delete=False
    ) as tmp:
        filepath = tmp.name
    audio.save(filepath)

    return caption, filepath


# `gr.components.Image` replaces the deprecated `gr.inputs.Image` and matches
# the namespace already used for the output components below.
inp = gr.components.Image(label="Upload any Image")
outputs = [
    gr.components.Textbox(type="text", label="Captions"),
    gr.components.Audio(type="filepath", label="audio"),
]

description = """

🔉 EchoSense Image to Audio Playground

This spaces helps generate audio descriptions for input Images

Please note:This space is for demonstration purposes only.

Visit Shreyas Dixit's personal website for more information about the creator.

"""
article = """Echo Sense is an innovative image captioning application that utilizes cutting-edge technology, specifically the powerful Transformer Model Architecture. This state-of-the-art approach has revolutionized Natural Language Processing (NLP) tasks, including image captioning, making it highly accurate and efficient. By leveraging pretrained models from Hugging Face and fine-tuning them on the COCO dataset, Echo Sense achieves exceptional performance while significantly reducing the computational cost and training time. The result is a versatile and reliable solution that not only produces accurate image captions but also generalizes well across various tasks. Experience the power of Echo Sense and witness firsthand the remarkable capabilities of the Transformer Model Architecture."""

# NOTE(review): `theme="grass"` and the `font=` keyword are kept exactly as
# written; fonts are normally configured on a theme object
# (e.g. gr.themes.Default(font=[...])). Confirm that both settings actually
# take effect on the installed Gradio version.
interface = gr.Interface(
    fn=predict,
    inputs=inp,
    outputs=outputs,
    title="",
    description=description,
    article=article,
    theme="grass",
    font=[
        gr.themes.GoogleFont("Open Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
)

interface.launch()