fmcurti committed
Commit 49b592c
Parent: 0088a18

Update app.py

Files changed (1)
app.py +4 -4
app.py CHANGED
@@ -33,7 +33,7 @@ def synthesise(text):
     input_ids = inputs["input_ids"]
     with torch.no_grad():
         outputs = spanish_model(input_ids)
-    speech = outputs.audio[0]
+    speech = outputs.waveform
     return speech.cpu()


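Note on the first hunk: the fix assumes the Spanish TTS model is a VITS-style checkpoint (for example facebook/mms-tts-spa loaded with VitsModel, neither of which is shown in this commit), whose output stores the generated audio in a waveform field rather than an audio attribute. A minimal sketch of that synthesis path under those assumptions:

import torch
from transformers import VitsModel, VitsTokenizer

# Assumed checkpoint and classes; the loading code is not part of this commit.
processor = VitsTokenizer.from_pretrained("facebook/mms-tts-spa")
spanish_model = VitsModel.from_pretrained("facebook/mms-tts-spa")

def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = spanish_model(input_ids)
    # The VitsModel output exposes the audio as `waveform`; there is no `audio` field.
    speech = outputs.waveform
    return speech.cpu()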
@@ -41,7 +41,7 @@ def synthesise(text):
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int32)
+    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech


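Note on the second hunk: 32767 is the largest positive value of signed 16-bit PCM, so a waveform in [-1.0, 1.0] scaled by 32767 fills the int16 range; the same numbers stored as int32 sit roughly 2**16 below int32 full scale, which typically plays back as near silence. A small sketch of the conversion (the to_pcm16 helper name is ours, not from app.py):

import numpy as np
import torch

def to_pcm16(waveform: torch.Tensor) -> np.ndarray:
    # Scale a float waveform in [-1.0, 1.0] to signed 16-bit PCM.
    audio = waveform.numpy()
    return (audio * 32767).astype(np.int16)

# Usage mirrors the diff: Gradio is handed (sample_rate, int16_array), e.g.
# return 16000, to_pcm16(synthesised_speech)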
@@ -49,7 +49,7 @@ title = "Cascaded STST"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-a
+
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """

@@ -75,4 +75,4 @@ file_translate = gr.Interface(
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

-demo.launch()
+demo.launch()
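Note on the last hunk: only the Blocks / TabbedInterface / launch structure is visible here; the sketch below fills in the surrounding Gradio wiring with assumed gr.Interface arguments to show where demo.launch() sits.

import gradio as gr

# speech_to_speech_translation is the function shown in the second hunk above;
# the Audio component arguments below are assumptions, not taken from app.py.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

demo = gr.Blocks()
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()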
 