Spaces:
Configuration error
Configuration error
MasonCrinr
committed on
Commit
•
ce8fc87
1
Parent(s):
6f6ee27
Upload 5 files
Browse files- .dummy_env +2 -0
- LICENSE +21 -0
- README.md +48 -12
- app.py +54 -0
- requirements.txt +0 -0
.dummy_env
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
HUGGINGFACEHUB_API_TOKEN = HUGGINGFACEHUB_API_TOKEN
|
2 |
+
OPENAI_API_KEY = OPENAI_API_KEY
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Sartaj Bhuvaji
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,12 +1,48 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Gradio App: Image to Story Generator
|
2 |
+
|
3 |
+
This Gradio app allows you to upload an image, and it will generate a short story based on the image's content using image captioning. The generated story is then converted to audio using text-to-speech technology. You can both see the generated story and listen to it.
|
4 |
+
|
5 |
+
# Demo
|
6 |
+
- Launching the application
|
7 |
+
![01](https://github.com/SartajBhuvaji/Image-to-Story-Generator/assets/31826483/984ad132-14eb-4ddf-8e5a-33fe2a7c7b28)
|
8 |
+
|
9 |
+
- Select an image and Upload
|
10 |
+
![02](https://github.com/SartajBhuvaji/Image-to-Story-Generator/assets/31826483/20ef38ee-562f-4cfa-9d64-3f01e85f231b)
|
11 |
+
|
12 |
+
- Image
|
13 |
+
![beach (1)](https://github.com/SartajBhuvaji/Image-to-Story-Generator/assets/31826483/69a5b52b-c6dd-41cb-889b-486977ebf37c)
|
14 |
+
|
15 |
+
|
16 |
+
- Download the audio story
|
17 |
+
|
18 |
+
https://github.com/SartajBhuvaji/Image-to-Story-Generator/assets/31826483/1fe00f34-9716-4047-9b57-a7794524816a
|
19 |
+
|
20 |
+
|
21 |
+
## Features
|
22 |
+
|
23 |
+
- Upload an image.
|
24 |
+
- Generate a story based on the content of the image.
|
25 |
+
- Listen to the generated story as an audio file.
|
26 |
+
|
27 |
+
## Usage
|
28 |
+
|
29 |
+
1. Clone this repository to your local machine.
|
30 |
+
|
31 |
+
```bash
|
32 |
+
git clone https://github.com/SartajBhuvaji/Image-to-Story-Generator.git
|
33 |
+
|
34 |
+
pip install -r requirements.txt
|
35 |
+
|
36 |
+
python app.py
|
37 |
+
```
|
38 |
+
`Create a .env file and add your Hugging Face and OpenAI API keys (see the .dummy_env file for the expected variable names).`
|
39 |
+
|
40 |
+
`Open your web browser and navigate to http://localhost:7860 to access the app.`
|
41 |
+
|
42 |
+
`Upload an image to the app and click "Generate Story." You will see the generated story and be able to listen to it as audio.`
|
43 |
+
|
44 |
+
# Tech
|
45 |
+
- HuggingFace
|
46 |
+
- Image to Caption model
|
47 |
+
- ChatGPT (GPT-3.5 Turbo) LLM
|
48 |
+
- Text-to-speech
|
app.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import dotenv
|
3 |
+
from transformers import pipeline
|
4 |
+
from langchain import LLMChain
|
5 |
+
from langchain.llms import OpenAI
|
6 |
+
from langchain.prompts import PromptTemplate
|
7 |
+
import requests
|
8 |
+
import os
|
9 |
+
|
10 |
+
# Pull variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# Token sent to the Hugging Face Inference API; None when the variable is unset.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
|
13 |
+
|
14 |
+
# image to text
|
15 |
+
# Lazily created captioning pipeline, shared across calls so the BLIP model is
# loaded once per process instead of once per request.
_IMG_TO_TEXT_PIPELINE = None


def imgToText(url):
    """Caption an image with the Salesforce BLIP base captioning model.

    Args:
        url: Image reference handed straight to the ``image-to-text``
            pipeline (the caller in this file passes ``image.name``).

    Returns:
        The ``generated_text`` field of the first result the pipeline returns.
    """
    global _IMG_TO_TEXT_PIPELINE
    if _IMG_TO_TEXT_PIPELINE is None:
        # Building the pipeline loads the model, which is the expensive part;
        # do it on first use only and reuse the object afterwards.
        _IMG_TO_TEXT_PIPELINE = pipeline(
            "image-to-text", model="Salesforce/blip-image-captioning-base"
        )
    return _IMG_TO_TEXT_PIPELINE(url)[0]["generated_text"]
|
19 |
+
|
20 |
+
# LLM
|
21 |
+
def generate_story(scenario):
    """Turn a short scenario description into a very short story.

    Fills a story-teller prompt with ``scenario`` and runs it through an
    ``LLMChain`` backed by the ``gpt-3.5-turbo`` model.

    Args:
        scenario: Text substituted for ``{scenario}`` in the prompt.

    Returns:
        The chain's prediction for the filled-in prompt.
    """
    story_template = """
You are a story teller.
You can generate a short story based on a simple narrative, the story should be no more than 40 words:

CONTEXT: {scenario}
STORY:
"""
    story_prompt = PromptTemplate(
        template=story_template, input_variables=["scenario"]
    )
    llm = OpenAI(model_name="gpt-3.5-turbo")
    chain = LLMChain(llm=llm, prompt=story_prompt, verbose=True)
    return chain.predict(scenario=scenario)
|
33 |
+
|
34 |
+
def textToSpeech(story):
    """Synthesize ``story`` to speech and save it as ``story.flac``.

    Posts the text to the Hugging Face Inference API model
    ``espnet/kan-bayashi_ljspeech_vits`` and writes the response body to
    ``story.flac`` (relative to the current working directory).

    Args:
        story: Text to convert to speech.

    Raises:
        RuntimeError: If ``HUGGINGFACEHUB_API_TOKEN`` is not configured.
        requests.HTTPError: If the API answers with an error status.
    """
    if not HUGGINGFACEHUB_API_TOKEN:
        # Fail with an actionable message instead of a TypeError from
        # concatenating "Bearer " with None.
        raise RuntimeError(
            "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file."
        )

    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": "Bearer " + HUGGINGFACEHUB_API_TOKEN}
    payload = {"inputs": story}

    # A timeout keeps a stalled connection from hanging the request forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    # Without this check an error body (JSON) would be saved as if it were audio.
    response.raise_for_status()

    with open("story.flac", "wb") as f:
        f.write(response.content)
|
41 |
+
|
42 |
+
def generate_story_and_play_audio(image):
    """Run the full image -> caption -> story -> audio pipeline.

    Args:
        image: Uploaded file object; its ``name`` attribute is passed to the
            captioning step.

    Returns:
        The string ``"story.flac"``, the file the text-to-speech step writes.
    """
    caption = imgToText(image.name)
    textToSpeech(generate_story(caption))
    return "story.flac"
|
47 |
+
|
48 |
+
# Gradio UI: one file upload in, one audio clip (served from a file path) out.
image_upload = gr.inputs.File(label="Upload an image")
story_audio = gr.outputs.Audio(label="Generated Story", type="filepath")

iface = gr.Interface(
    fn=generate_story_and_play_audio,
    inputs=image_upload,
    outputs=story_audio,
)

# Start the web server for the interface.
iface.launch()
|
requirements.txt
ADDED
Binary file (290 Bytes). View file
|
|