image-to-audio init commit
- .gitignore +6 -1
- README.md +19 -2
- app.py +178 -0
- audio/.placeholder +0 -0
- ref/gpt_chatbot_with_langchain.py +21 -0
- ref/text-to-audio.py +23 -0
- requirements.txt +5 -0
.gitignore
CHANGED
@@ -157,4 +157,9 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-
+.idea/
+.vscode/
+
+audio/*.wav
+img/*.jpeg
+img/*.jpg
README.md
CHANGED
@@ -1,2 +1,19 @@
-#
-
+# The Image Reader 📢
+
+[The Image Reader 📢 - Playground](www.google.com)
+
+This application analyzes the uploaded image, generates an imaginative phrase, and then converts it into audio.
+
+- For **image_to_audio**, the following technologies were used:
+- **Image Reader:**
+    - The HuggingFace ```image-to-text``` task is used with the ```Salesforce/blip-image-captioning-base``` pretrained model, which produces a short description of the image.
+    - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)
+    - BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
+- **Generate an imaginative phrase:**
+    - OpenAI ```GPT-3.5-Turbo``` is used to produce an imaginative narrative from the description generated earlier.
+    - The generated phrase is no more than 40 words.
+    - [GPT-3.5 Turbo](https://openai.com/blog/gpt-3-5-turbo-fine-tuning-and-api-updates)
+- **text-to-audio:**
+    - ```suno/bark-small``` is used to generate the audio version of the imaginative narrative.
+    - [suno/bark-small](https://huggingface.co/suno/bark-small)
+    - **BARK**: Bark is a transformer-based text-to-audio model created by [Suno](https://www.suno.ai/). Bark can generate highly realistic, multilingual speech as well as other audio, including music, background noise and simple sound effects. The model can also produce nonverbal communication such as laughing, sighing and crying.
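The three stages described in the README above can also be exercised outside Streamlit with a short standalone script. The following is only a minimal sketch, not part of this commit: it assumes `transformers`, `torch`, `scipy`, and `langchain` are installed and that `OPENAI_API_KEY` is set in the environment; the paths `img/example.jpg` and `audio/example.wav` and the prompt wording are placeholders for illustration.

```python
# Minimal sketch of the caption -> narrative -> speech flow (illustrative only).
import scipy.io.wavfile
from langchain.chat_models import ChatOpenAI
from transformers import AutoProcessor, BarkModel, pipeline

# 1. Image -> caption with BLIP.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
caption = captioner("img/example.jpg")[0]["generated_text"]  # placeholder image path

# 2. Caption -> short imaginative narrative with GPT-3.5 Turbo
#    (reads OPENAI_API_KEY from the environment).
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
story = llm.predict(f"Turn this caption into a story of no more than 40 words: {caption}")

# 3. Narrative -> speech with Bark, saved as a WAV file.
processor = AutoProcessor.from_pretrained("suno/bark")
bark = BarkModel.from_pretrained("suno/bark-small")
speech = bark.generate(**processor(story))
scipy.io.wavfile.write(
    "audio/example.wav",  # placeholder output path
    rate=bark.generation_config.sample_rate,
    data=speech[0].cpu().numpy(),
)
```

app.py, added below, wires these same stages into a Streamlit UI with cached model loading and a progress bar.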
app.py
ADDED
@@ -0,0 +1,178 @@
+import os
+
+import scipy
+import streamlit as st
+import torch
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
+from langchain_core.messages import SystemMessage
+from transformers import AutoProcessor, BarkModel, pipeline
+
+
+# create image-to-text pipeline
+@st.cache_resource
+def create_image_to_text_pipeline():
+    """create image to text pipeline"""
+
+    task = "image-to-text"
+    model = "Salesforce/blip-image-captioning-base"
+    img_to_text_pipeline = pipeline(task, model=model)
+    return img_to_text_pipeline
+
+
+# generate information about the image
+def image_to_text(url):
+    """image to text"""
+
+    generate_kwargs = {
+        "do_sample": True,
+        "temperature": 0.7,
+        "max_new_tokens": 256,
+    }
+
+    pipe = create_image_to_text_pipeline()
+    txt = pipe(url, generate_kwargs=generate_kwargs)[0]["generated_text"]
+    return txt
+
+
+# load language models
+@st.cache_resource
+def load_llm_model(openai_key):
+    """load llm model"""
+
+    model = ChatOpenAI(
+        model_name="gpt-3.5-turbo", openai_api_key=openai_key, temperature=0
+    )
+    return model
+
+
+# generate audio script
+def generate_audio_script(openai_key, scenario):
+    """generate audio script"""
+
+    chat_template = ChatPromptTemplate.from_messages(
+        [
+            SystemMessage(
+                content=(
+                    "You are a storyteller. "
+                    "You can generate a story based on a simple narrative, "
+                    "the story should be no more than 40 words."
+                )
+            ),
+            HumanMessagePromptTemplate.from_template("{scenario}"),
+        ]
+    )
+
+    llm_model = load_llm_model(openai_key)
+    ai_response = llm_model(chat_template.format_messages(scenario=scenario))
+    script = ai_response.content
+    return script
+
+
+# load audio pipeline
+@st.cache_resource
+def load_audio_pipeline():
+    """load audio pipeline"""
+
+    synthesiser = BarkModel.from_pretrained("suno/bark-small")
+    audio_processor = AutoProcessor.from_pretrained("suno/bark")
+    return synthesiser, audio_processor
+
+
+def generate_audio(script):
+    """generate audio"""
+
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    print("Device: ", device)
+
+    print("Script: ", script)
+    model, processor = load_audio_pipeline()
+
+    inputs = processor(script)
+    model = model.to(device)
+
+    speech_output = model.generate(**inputs.to(device))
+    sampling_rate = model.generation_config.sample_rate
+    scipy.io.wavfile.write(
+        "audio/bark_output.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy()
+    )
+
+
+def main():
+    """main"""
+
+    st.set_page_config(
+        page_title="Image to Speech",
+        page_icon="📢",
+        layout="centered",
+        initial_sidebar_state="collapsed",
+    )
+
+    st.header("The Image Reader 📢", divider="rainbow")
+
+    st.subheader(
+        "This application :red[analyzes] the uploaded image, generates an :green[imaginative phrase], and then converts it into :blue[audio] :sunglasses:"
+    )
+
+    st.markdown("[check out the repository](https://github.com/ThivaV/image_to_audio)")
+
+    openai_key = st.text_input("Enter your OpenAI key 🔑", type="password")
+
+    progress_bar_message = "Operation in progress. Please wait."
+
+    uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg"])
+    if uploaded_image is not None:
+        progress_bar = st.progress(0, text=progress_bar_message)
+
+        # rename all the uploaded images to "uploaded_image"
+        image_ext = os.path.splitext(uploaded_image.name)[1]
+        new_image_name = "uploaded_image" + image_ext
+        image_save_path = "img/" + new_image_name
+
+        byte_data = uploaded_image.getvalue()
+        with open(image_save_path, "wb") as file:
+            file.write(byte_data)
+
+        # 10% completed
+        progress_bar.progress(10, text=progress_bar_message)
+
+        col_1, col_2 = st.columns([6, 4])
+
+        with col_1:
+            st.image(uploaded_image, caption="Uploaded image.", use_column_width=True)
+
+        # 20% completed
+        progress_bar.progress(20, text=progress_bar_message)
+
+        scenario = image_to_text(image_save_path)
+
+        # 40% completed
+        progress_bar.progress(40, text=progress_bar_message)
+
+        script = generate_audio_script(openai_key, scenario)
+
+        # 60% completed
+        progress_bar.progress(60, text=progress_bar_message)
+
+        generate_audio(script)
+
+        # 90% completed
+        progress_bar.progress(90, text=progress_bar_message)
+
+        with col_2:
+            with st.expander("About the image"):
+                st.write(scenario)
+
+            with st.expander("Script"):
+                st.write(script)
+
+            st.audio("audio/bark_output.wav")
+
+        # 100% completed
+        progress_bar.progress(
+            100, text="Operation completed. Thank you for your patience."
+        )
+
+
+if __name__ == "__main__":
+    main()
audio/.placeholder
ADDED
File without changes
ref/gpt_chatbot_with_langchain.py
ADDED
@@ -0,0 +1,21 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import HumanMessagePromptTemplate
+from langchain_core.messages import SystemMessage
+from langchain.prompts import ChatPromptTemplate
+
+chat_template = ChatPromptTemplate.from_messages(
+    [
+        SystemMessage(
+            content=(
+                "You are a storyteller. "
+                "You can generate a story based on a simple narrative, "
+                "the story should be no more than 20 words."
+            )
+        ),
+        HumanMessagePromptTemplate.from_template("{text}")
+    ]
+)
+
+llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+resp = llm(chat_template.format_messages(text="white wool shirt"))
+print(resp.content)
ref/text-to-audio.py
ADDED
@@ -0,0 +1,23 @@
+# text-to-audio using suno/bark
+import scipy
+import torch
+from transformers import AutoProcessor
+from transformers import BarkModel
+
+model = BarkModel.from_pretrained("suno/bark-small")
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print("device: ", device)
+
+processor = AutoProcessor.from_pretrained("suno/bark")
+
+# prepare the inputs
+text_prompt = "You are a story teller. You can generate a story based on a simple narrative, the story be no more than 20 words."
+inputs = processor(text_prompt)
+
+# generate speech
+model = model.to(device)
+speech_output = model.generate(**inputs.to(device))
+
+sampling_rate = model.generation_config.sample_rate
+scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy())
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+streamlit
+scipy
+transformers[torch]
+langchain==0.0.352
+openai==1.6.1