import gradio as gr
import numpy as np

from audioldm import text_to_audio, build_model
from share_btn import community_icon_html, loading_icon_html, share_js

model_id = "haoheliu/AudioLDM-S-Full"

# Lazily-initialised model state: the checkpoint is only built on the first
# request and cached for subsequent calls (see text2audio below).
audioldm = None
current_model_name = None
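
# text2audio is the app's single inference entry point: it generates one or
# more audio clips for a text prompt and returns them as waveform videos.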
def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name="audioldm-m-text-ft"):
    global audioldm, current_model_name

    # Build the model on first use, and rebuild whenever a different
    # checkpoint is requested, so switching models needs no app restart.
    if audioldm is None or model_name != current_model_name:
        audioldm = build_model(model_name=model_name)
        current_model_name = model_name

    waveform = text_to_audio(
        latent_diffusion=audioldm,
        text=text,
        seed=int(random_seed),
        duration=duration,
        guidance_scale=guidance_scale,
        n_candidate_gen_per_text=int(n_candidates),
    )
    # text_to_audio yields 16 kHz waveforms; gr.make_waveform renders each
    # clip as a video file with the waveform drawn over bg.png.
    waveform = [
        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
    ]

    # Unwrap single-clip results to match the single output component below.
    if len(waveform) == 1:
        waveform = waveform[0]
    return waveform
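
# A minimal Gradio front end for text2audio. gr.make_waveform returns a video
# file path, so the result is shown with a Video component rather than Audio.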
iface = gr.Interface(
    fn=text2audio,
    inputs=[
        gr.Textbox(value="A man is speaking in a huge room", max_lines=1, label="Input text"),
        gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)"),
        gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale"),
        gr.Number(value=42, label="Seed", precision=0),
        gr.Number(value=3, label="Candidates per text", precision=0),
    ],
    outputs=gr.Video(label="Output"),
    allow_flagging="never",
)
iface.launch(share=False)
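
# Hypothetical programmatic use, bypassing the UI (argument names are the
# function's own parameters):
#   video_path = text2audio("A dog is barking", duration=5, guidance_scale=2.5,
#                           random_seed=42, n_candidates=3)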