fmcurti committed
Commit 49b592c
Parent: 0088a18

Update app.py

Files changed (1)
app.py +4 -4
app.py CHANGED
@@ -33,7 +33,7 @@ def synthesise(text):
     input_ids = inputs["input_ids"]
     with torch.no_grad():
         outputs = spanish_model(input_ids)
-    speech = outputs.audio[0]
+    speech = outputs.waveform
     return speech.cpu()


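Note on the first hunk: the fix assumes the Spanish TTS model is a VITS-style checkpoint (for example facebook/mms-tts-spa loaded with VitsModel, neither of which is shown in this commit), whose output stores the generated audio in a waveform field rather than an audio attribute. A minimal sketch of that synthesis path under those assumptions:

import torch
from transformers import VitsModel, VitsTokenizer

# Assumed checkpoint and classes; the loading code is not part of this commit.
processor = VitsTokenizer.from_pretrained("facebook/mms-tts-spa")
spanish_model = VitsModel.from_pretrained("facebook/mms-tts-spa")

def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = spanish_model(input_ids)
    # The VitsModel output exposes the audio as `waveform`; there is no `audio` field.
    speech = outputs.waveform
    return speech.cpu()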
@@ -41,7 +41,7 @@ def synthesise(text):
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int32)
+    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech


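Note on the second hunk: 32767 is the largest positive value of signed 16-bit PCM, so a waveform in [-1.0, 1.0] scaled by 32767 fills the int16 range; the same numbers stored as int32 sit roughly 2**16 below int32 full scale, which typically plays back as near silence. A small sketch of the conversion (the to_pcm16 helper name is ours, not from app.py):

import numpy as np
import torch

def to_pcm16(waveform: torch.Tensor) -> np.ndarray:
    # Scale a float waveform in [-1.0, 1.0] to signed 16-bit PCM.
    audio = waveform.numpy()
    return (audio * 32767).astype(np.int16)

# Usage mirrors the diff: Gradio is handed (sample_rate, int16_array), e.g.
# return 16000, to_pcm16(synthesised_speech)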
@@ -49,7 +49,7 @@ title = "Cascaded STST"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-a
+
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """

@@ -75,4 +75,4 @@ file_translate = gr.Interface(
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

-demo.launch()
+demo.launch()
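Note on the last hunk: only the Blocks / TabbedInterface / launch structure is visible here; the sketch below fills in the surrounding Gradio wiring with assumed gr.Interface arguments to show where demo.launch() sits.

import gradio as gr

# speech_to_speech_translation is the function shown in the second hunk above;
# the Audio component arguments below are assumptions, not taken from app.py.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

demo = gr.Blocks()
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()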
 