Update app.py
app.py CHANGED
@@ -66,32 +66,32 @@ class Tango:
         else:
             return list(self.chunks(outputs, samples))

-# Initialize
-tango = Tango()
+# Initialize TANGO
+if torch.cuda.is_available():
+    tango = Tango()
+else:
+    tango = Tango(device="cpu")

-def gradio_generate(prompt):
-
-    output_wave = tango.generate(prompt)
-
-    # Save the output_wave as a temporary WAV file
+def gradio_generate(prompt, steps):
+    output_wave = tango.generate(prompt, int(steps))
     output_filename = "temp_output.wav"
     wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)

     return output_filename

-# Add the description text box
 description_text = '''
 TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
 '''

-#
+# Gradio input and output components
 input_text = gr.inputs.Textbox(lines=2, label="Prompt")
 output_audio = gr.outputs.Audio(label="Generated Audio", type="filepath")
+denoising_steps = gr.Number(value=100, label="Steps", interactive=True, precision=0)

-#
+# Gradio interface
 gr_interface = gr.Interface(
     fn=gradio_generate,
-    inputs=input_text,
+    inputs=[input_text, denoising_steps],
     outputs=[output_audio],
     title="TANGO: Text to Audio using Instruction-Guided Diffusion",
     description="Generate audio using TANGO by providing a text prompt.",
@@ -99,16 +99,17 @@ gr_interface = gr.Interface(
     examples=[
         ["An audience cheering and clapping"],
         ["Rolling thunder with lightning strikes"],
+        ["Gentle water stream, birds chirping and sudden gun shot"],
         ["A car engine revving"],
         ["A dog barking"],
         ["A cat meowing"],
         ["Emergency sirens wailing"],
         ["Whistling with birds chirping"],
-        ["A
+        ["A person snoring"],
         ["Motor vehicles are driving with loud engines and a person whistles"],
-        ["People cheering in a stadium while
+        ["People cheering in a stadium while thunder and lightning strikes"],
         ["A helicopter is in flight"],
-        ["A
+        ["A dog barking and a man talking and a racing car passes by"],
     ],
     cache_examples=False,
 )
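For anyone who wants to exercise the new Steps input without downloading the TANGO checkpoint, here is a minimal, self-contained sketch of the same Gradio wiring. It swaps tango.generate for a stub that writes one second of silence, so only the prompt/steps plumbing and the 16 kHz WAV round-trip are tested; the component calls mirror the ones in this diff, while dummy_generate, the demo variable, and the final launch() call are illustrative assumptions (the launch line of app.py is not part of the hunks shown above).

import numpy as np
import gradio as gr
import wavio

def dummy_generate(prompt, steps):
    # Stand-in for tango.generate(prompt, int(steps)): one second of silence at 16 kHz.
    output_wave = np.zeros(16000, dtype=np.int16)
    output_filename = "temp_output.wav"
    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
    return output_filename

# Same components as the updated app.py (gr.inputs/gr.outputs are the legacy
# Gradio namespaces the Space already uses).
input_text = gr.inputs.Textbox(lines=2, label="Prompt")
output_audio = gr.outputs.Audio(label="Generated Audio", type="filepath")
denoising_steps = gr.Number(value=100, label="Steps", interactive=True, precision=0)

demo = gr.Interface(
    fn=dummy_generate,
    inputs=[input_text, denoising_steps],
    outputs=[output_audio],
    title="TANGO wiring check (stubbed generator)",
)

if __name__ == "__main__":
    demo.launch()  # assumed entry point; app.py presumably launches gr_interface the same way

Passing the inputs as a list is what lets Gradio forward the Number value as the second positional argument of the callback, which is why gradio_generate gains the extra steps parameter in the same commit.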