Spaces:
Runtime error
Runtime error
import os | |
import scipy | |
import streamlit as st | |
import torch | |
from langchain.chat_models import ChatOpenAI | |
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate | |
from langchain_core.messages import SystemMessage | |
from transformers import AutoProcessor, BarkModel, pipeline | |
# create image-to-text pipeline | |
@st.cache_resource(show_spinner=False)
def create_image_to_text_pipeline():
    """Create the image-to-text (captioning) pipeline.

    The result is cached with ``st.cache_resource`` so the model is loaded
    once per server process instead of on every call / Streamlit rerun.
    The default cache spinner is disabled so the UI is unchanged.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"image-to-text"`` task using the
        ``Salesforce/blip-image-captioning-base`` checkpoint.
    """
    task = "image-to-text"
    model = "Salesforce/blip-image-captioning-base"
    return pipeline(task, model=model)
# generate information about the image | |
def image_to_text(url):
    """Generate a textual description of an image.

    Args:
        url: Image reference passed unchanged to the image-to-text pipeline
            (the caller in this file passes a local file path).

    Returns:
        The ``"generated_text"`` value of the first pipeline result.
    """
    captioner = create_image_to_text_pipeline()

    # Sampling settings forwarded to the underlying ``generate`` call.
    sampling_options = dict(do_sample=True, temperature=0.7, max_new_tokens=256)

    results = captioner(url, generate_kwargs=sampling_options)
    first_result = results[0]
    return first_result["generated_text"]
# load language models | |
def load_llm_model(openai_key):
    """Build the chat model used to write the story.

    Args:
        openai_key: OpenAI API key forwarded to ``ChatOpenAI``.

    Returns:
        A ``ChatOpenAI`` instance configured for ``gpt-3.5-turbo`` with
        ``temperature=0``.
    """
    llm_settings = {
        "model_name": "gpt-3.5-turbo",
        "openai_api_key": openai_key,
        "temperature": 0,
    }
    return ChatOpenAI(**llm_settings)
# generate audio script | |
def generate_audio_script(openai_key, scenario):
    """Turn an image description into a short story script.

    Args:
        openai_key: OpenAI API key used to build the chat model.
        scenario: Narrative text (the image caption) inserted into the
            human message of the prompt.

    Returns:
        The ``content`` of the chat model's response message.
    """
    chat_template = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                content=(
                    "You are a story teller. "
                    "You can generate a story based on a simple narrative, "
                    "the story be no more than 40 words."
                )
            ),
            HumanMessagePromptTemplate.from_template("{scenario}"),
        ]
    )
    messages = chat_template.format_messages(scenario=scenario)

    llm_model = load_llm_model(openai_key)
    # Use ``invoke`` rather than calling the model object directly; the
    # direct ``__call__`` form is deprecated in LangChain.
    ai_response = llm_model.invoke(messages)
    return ai_response.content
# load audio pipeline | |
@st.cache_resource(show_spinner=False)
def load_audio_pipeline():
    """Load the Bark text-to-speech model and its processor.

    The pair is cached with ``st.cache_resource`` so the weights are loaded
    once per server process instead of on every call / Streamlit rerun.
    The default cache spinner is disabled so the UI is unchanged.

    Returns:
        A ``(synthesiser, audio_processor)`` tuple: the ``BarkModel`` and the
        processor created by ``AutoProcessor``.
    """
    synthesiser = BarkModel.from_pretrained("suno/bark-small")
    # NOTE(review): the processor comes from "suno/bark" while the model is
    # "suno/bark-small" - confirm whether this mismatch is intentional.
    audio_processor = AutoProcessor.from_pretrained("suno/bark")
    return synthesiser, audio_processor
def generate_audio(script, output_path="audio/bark_output.wav"):
    """Synthesise speech for *script* and write it to a WAV file.

    Args:
        script: Text to be spoken.
        output_path: Destination WAV file. Defaults to the path the rest of
            this file reads from. Its parent directory is created if missing.

    Returns:
        None. The audio is written to ``output_path``.
    """
    # Imported explicitly: a bare ``import scipy`` is not guaranteed to expose
    # the ``scipy.io.wavfile`` submodule on every SciPy version.
    from scipy.io import wavfile

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Device: ", device)
    print("Script: ", script)

    model, processor = load_audio_pipeline()
    inputs = processor(script)
    model = model.to(device)
    speech_output = model.generate(**inputs.to(device))
    sampling_rate = model.generation_config.sample_rate

    # Writing fails with FileNotFoundError if the target directory is absent
    # (e.g. on a fresh checkout), so create it first.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    wavfile.write(
        output_path, rate=sampling_rate, data=speech_output[0].cpu().numpy()
    )
def main():
    """Render the Streamlit page and run the image -> story -> audio flow.

    Steps, once an image is uploaded:
      1. Save the upload to ``img/uploaded_image<ext>``.
      2. Caption the image (``image_to_text``).
      3. Turn the caption into a short story (``generate_audio_script``).
      4. Synthesise the story to ``audio/bark_output.wav`` (``generate_audio``).
      5. Show the caption, the story and an audio player.

    A progress bar is advanced after each step.
    """
    # NOTE(review): the icon strings below ("π’", "π") look like mis-encoded
    # emoji - confirm the intended characters against the original file.
    st.set_page_config(
        page_title="Image to Speech",
        page_icon="π’",
        layout="centered",
        initial_sidebar_state="collapsed",
    )

    st.header("The Image Reader π’", divider="rainbow")
    st.subheader(
        "This application :red[analyzes] the uploaded image, generates an :green[imaginative phrase], and then converts it into :blue[audio] :sunglasses:"
    )
    st.markdown("[check out the repository](https://github.com/ThivaV/image_to_audio)")

    openai_key = st.text_input("Enter your OpenAI key π", type="password")

    progress_bar_message = "Operation in progress. Please wait."

    uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg"])
    if uploaded_image is None:
        return

    # Stop before the expensive captioning step when no key is available.
    # The environment variable is honoured as well, since ChatOpenAI can
    # presumably pick the key up from OPENAI_API_KEY - verify for the
    # installed LangChain version.
    if not openai_key and not os.environ.get("OPENAI_API_KEY"):
        st.warning("Please enter your OpenAI key to continue.")
        return

    progress_bar = st.progress(0, text=progress_bar_message)

    # rename all the uploaded images to "uploaded_image"
    image_ext = os.path.splitext(uploaded_image.name)[1]
    new_image_name = "uploaded_image" + image_ext
    image_dir = "img"
    # The target directory may not exist on a fresh checkout.
    os.makedirs(image_dir, exist_ok=True)
    image_save_path = image_dir + "/" + new_image_name

    byte_data = uploaded_image.getvalue()
    with open(image_save_path, "wb") as file:
        file.write(byte_data)

    # 10% completed
    progress_bar.progress(10, text=progress_bar_message)

    col_1, col_2 = st.columns([6, 4])

    with col_1:
        st.image(uploaded_image, caption="Uploaded image.", use_column_width=True)

    # 20% completed
    progress_bar.progress(20, text=progress_bar_message)

    scenario = image_to_text(image_save_path)

    # 40% completed
    progress_bar.progress(40, text=progress_bar_message)

    script = generate_audio_script(openai_key, scenario)

    # 60% completed
    progress_bar.progress(60, text=progress_bar_message)

    generate_audio(script)

    # 90% completed
    progress_bar.progress(90, text=progress_bar_message)

    with col_2:
        with st.expander("About the image"):
            st.write(scenario)

        with st.expander("Script"):
            st.write(script)

        st.audio("audio/bark_output.wav")

    # 100% completed
    progress_bar.progress(
        100, text="Operation completed. Thank you for your patience."
    )
if __name__ == "__main__":
    # Launch the app only when this file is executed as a script.
    main()