import gradio as gr
import numpy as np

from audioldm import text_to_audio, build_model
from share_btn import community_icon_html, loading_icon_html, share_js

model_id = "haoheliu/AudioLDM-S-Full"

# Lazily-initialised model state: the checkpoint is only built on the first
# request and cached for subsequent calls (see text2audio below).
audioldm = None
current_model_name = None
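
# text2audio is the app's single inference entry point: it generates one or
# more audio clips for a text prompt and returns them as waveform videos.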
def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name="audioldm-m-text-ft"):
    global audioldm, current_model_name

    # Build the model on first use, and rebuild whenever a different
    # checkpoint is requested, so switching models needs no app restart.
    if audioldm is None or model_name != current_model_name:
        audioldm = build_model(model_name=model_name)
        current_model_name = model_name

    waveform = text_to_audio(
        latent_diffusion=audioldm,
        text=text,
        seed=int(random_seed),
        duration=duration,
        guidance_scale=guidance_scale,
        n_candidate_gen_per_text=int(n_candidates),
    )
    # text_to_audio yields 16 kHz waveforms; gr.make_waveform renders each
    # clip as a video file with the waveform drawn over bg.png.
    waveform = [
        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
    ]

    # Unwrap single-clip results to match the single output component below.
    if len(waveform) == 1:
        waveform = waveform[0]
    return waveform
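
# A minimal Gradio front end for text2audio. gr.make_waveform returns a video
# file path, so the result is shown with a Video component rather than Audio.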
iface = gr.Interface(
    fn=text2audio,
    inputs=[
        gr.Textbox(value="A man is speaking in a huge room", max_lines=1, label="Input text"),
        gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)"),
        gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale"),
        gr.Number(value=42, label="Seed", precision=0),
        gr.Number(value=3, label="Candidates per text", precision=0),
    ],
    outputs=gr.Video(label="Output"),
    allow_flagging="never",
)
iface.launch(share=False)
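
# Hypothetical programmatic use, bypassing the UI (argument names are the
# function's own parameters):
#   video_path = text2audio("A dog is barking", duration=5, guidance_scale=2.5,
#                           random_seed=42, n_candidates=3)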