Mark Duppenthaler committed on
Commit 1727d3b
1 Parent(s): a7361bc

work with m4t model

Files changed (2):
  1. app.py +125 -57
  2. m4t_app.py +463 -0
app.py CHANGED
@@ -11,73 +11,141 @@ from seamless_communication.models.inference.translator import Translator
 
 from m4t_app import *
 
-from transformers import pipeline
-
-p = pipeline("automatic-speech-recognition")
-
 from pydub import AudioSegment
 import time
 from time import sleep
 
-m4t_demo()
-
-def transcribe(audio, state=""):
-    # sleep(2)
-    print('state', state)
-    text = p(audio)["text"]
-    state += text + " "
-    return state
+# m4t_demo()
+
+USE_M4T = True
+
+
+def translate_audio_file_segment(audio_file):
+    print("translate_m4t state")
+
+    return predict(
+        task_name="S2ST",
+        audio_source="microphone",
+        input_audio_mic=audio_file,
+        input_audio_file=None,
+        input_text="",
+        source_language="English",
+        target_language="Portuguese",
+    )
+
+
+def translate_m4t_callback(
+    audio_file, translated_audio_bytes_state, translated_text_state
+):
+    translated_wav_segment, translated_text = translate_audio_file_segment(audio_file)
+    print('translated_audio_bytes_state', translated_audio_bytes_state)
+    print('translated_wav_segment', translated_wav_segment)
+
+    # combine translated wav into larger..
+    if type(translated_audio_bytes_state) is not tuple:
+        translated_audio_bytes_state = translated_wav_segment
+    else:
+        translated_audio_bytes_state = (translated_audio_bytes_state[0], np.append(translated_audio_bytes_state[1], translated_wav_segment[1]))
+        # translated_wav_segment[1]
+
+    translated_text_state += " | " + str(translated_text)
+    return [
+        audio_file,
+        translated_wav_segment,
+        translated_audio_bytes_state,
+        translated_text_state,
+        translated_audio_bytes_state,
+        translated_text_state,
+    ]
+
+
+def clear():
+    print("Clearing State")
+    return [bytes(), ""]
 
 
 def blocks():
     with gr.Blocks() as demo:
-        total_audio_bytes_state = gr.State(bytes())
-        total_text_state = gr.State("")
+        translated_audio_bytes_state = gr.State(None)
+        translated_text_state = gr.State("")
 
         # input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
-        input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3", source="microphone", streaming=True)
-        with gr.Row():
-            with gr.Column():
-                stream_as_bytes_btn = gr.Button("Stream as Bytes")
-                stream_as_bytes_output = gr.Audio(format="bytes", streaming=True)
-                stream_output_text = gr.Textbox(label="Translated text")
-
-        def stream_bytes(audio_file, total_audio_bytes_state, total_text_state):
-            chunk_size = 30000
-
-            print(f"audio_file {audio_file}, size {os.path.getsize(audio_file)}")
-            with open(audio_file, "rb") as f:
-                while True:
-                    chunk = f.read(chunk_size)
-                    if chunk:
-                        total_audio_bytes_state += chunk
-                        print('yielding chunk', len(chunk))
-                        print('total audio bytes', len(total_audio_bytes_state))
-                        print(f"Text state: {total_text_state}")
-
-                        # This does the whole thing every time
-                        # total_text = transcribe(chunk, "")
-                        # yield total_audio_bytes_state, total_text, total_audio_bytes_state, total_text_state
-
-                        # This translates just the new part every time
-                        total_text_state = transcribe(chunk, total_text_state)
-                        total_text = total_text_state
-                        # total_text = transcribe(chunk, total_text)
-                        yield total_audio_bytes_state, total_text, total_audio_bytes_state, total_text_state
-                        # sleep(3)
-                    else:
-                        break
-
-        def clear():
-            print('clearing')
-            return [bytes(), ""]
-
-        stream_as_bytes_btn.click(stream_bytes, [input_audio, total_audio_bytes_state, total_text_state], [stream_as_bytes_output, stream_output_text, total_audio_bytes_state, total_text_state])
-
-        input_audio.change(stream_bytes, [input_audio, total_audio_bytes_state, total_text_state], [stream_as_bytes_output, stream_output_text, total_audio_bytes_state, total_text_state])
-        input_audio.clear(clear, None, [total_audio_bytes_state, total_text_state])
-        input_audio.start_recording(clear, None, [total_audio_bytes_state, total_text_state])
+        if USE_M4T:
+            input_audio = gr.Audio(
+                label="Input Audio",
+                type="filepath",
+                source="microphone",
+                streaming=True,
+            )
+        else:
+            input_audio = gr.Audio(
+                label="Input Audio",
+                type="filepath",
+                format="mp3",
+                source="microphone",
+                streaming=True,
+            )
+
+        most_recent_input_audio_segment = gr.Audio(
+            label="Recent Input Audio Segment segments", format="bytes", streaming=True
+        )
+        # TODO: Should add combined input audio segments...
+
+        stream_as_bytes_btn = gr.Button("Translate most recent recording segment")
+
+        output_translation_segment = gr.Audio(
+            label="Translated audio segment",
+            autoplay=False,
+            streaming=True,
+            type="numpy",
+        )
+
+        output_translation_combined = gr.Audio(
+            label="Translated audio combined",
+            autoplay=False,
+            streaming=True,
+            type="numpy",
+        )
+
+        # Could add output text segment
+        stream_output_text = gr.Textbox(label="Translated text")
+
+        stream_as_bytes_btn.click(
+            translate_m4t_callback,
+            [input_audio, translated_audio_bytes_state, translated_text_state],
+            [
+                most_recent_input_audio_segment,
+                output_translation_segment,
+                output_translation_combined,
+                stream_output_text,
+                translated_audio_bytes_state,
+                translated_text_state,
+            ],
+        )
+
+        input_audio.change(
+            translate_m4t_callback,
+            [input_audio, translated_audio_bytes_state, translated_text_state],
+            [
+                most_recent_input_audio_segment,
+                output_translation_segment,
+                output_translation_combined,
+                stream_output_text,
+                translated_audio_bytes_state,
+                translated_text_state,
+            ],
+        )
+        # input_audio.change(stream_bytes, [input_audio, translated_audio_bytes_state, translated_text_state], [most_recent_input_audio_segment, stream_output_text, translated_audio_bytes_state, translated_text_state])
+        # input_audio.change(lambda input_audio: recorded_audio, [input_audio], [recorded_audio])
+        input_audio.clear(
+            clear, None, [translated_audio_bytes_state, translated_text_state]
+        )
+        input_audio.start_recording(
+            clear, None, [translated_audio_bytes_state, translated_text_state]
+        )
 
         demo.queue().launch()
 
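The new translate_m4t_callback accumulates S2ST output by appending each translated waveform onto a running (sample_rate, np.ndarray) state tuple. Below is a minimal sketch of that accumulation pattern, independent of Gradio; the helper name and the synthetic sine segments are illustrative only, and it assumes every segment arrives at the same sample rate (the callback above makes the same assumption).

```python
import numpy as np

def append_audio_segment(combined, segment):
    """Accumulate (sample_rate, waveform) tuples the way translate_m4t_callback does.

    `combined` is None (anything that is not a tuple) on the first call; afterwards
    it is a (sample_rate, np.ndarray) pair. Assumes all segments share one sample rate.
    """
    if not isinstance(combined, tuple):
        return segment
    sample_rate, waveform = combined
    return (sample_rate, np.append(waveform, segment[1]))

# Example: two half-second sine bursts at 16 kHz become one one-second clip.
sr = 16000
t = np.linspace(0, 0.5, int(sr * 0.5), endpoint=False)
seg_a = (sr, np.sin(2 * np.pi * 440 * t).astype(np.float32))
seg_b = (sr, np.sin(2 * np.pi * 660 * t).astype(np.float32))

state = None
for seg in (seg_a, seg_b):
    state = append_audio_segment(state, seg)

print(state[0], state[1].shape)  # 16000 (16000,)
```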
m4t_app.py ADDED
@@ -0,0 +1,463 @@
+from __future__ import annotations
+
+import os
+
+import gradio as gr
+import numpy as np
+import torch
+import torchaudio
+from seamless_communication.models.inference.translator import Translator
+
+from lang_list import (
+    LANGUAGE_NAME_TO_CODE,
+    S2ST_TARGET_LANGUAGE_NAMES,
+    S2TT_TARGET_LANGUAGE_NAMES,
+    T2TT_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+)
+
+DESCRIPTION = """# SeamlessM4T
+
+# mduppes aaaaaa
+
+[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
+translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+
+This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
+translation and more, without relying on multiple separate models.
+"""
+
+CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
+
+TASK_NAMES = [
+    "S2ST (Speech to Speech translation)",
+    "S2TT (Speech to Text translation)",
+    "T2ST (Text to Speech translation)",
+    "T2TT (Text to Text translation)",
+    "ASR (Automatic Speech Recognition)",
+]
+AUDIO_SAMPLE_RATE = 16000.0
+MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+DEFAULT_TARGET_LANGUAGE = "French"
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print("DEVICE", device)
+translator = Translator(
+    model_name_or_card="seamlessM4T_medium",
+    vocoder_name_or_card="vocoder_36langs",
+    device=device,
+    # dtype=torch.float16,
+    # For CPU Mode need to use 32, float16 causes errors downstream
+    dtype=torch.float32,
+)
+
+def get_translator():
+    return translator
+
+
+def transcribe(audio):
+    print(audio)
+    text = p(audio)["text"]
+    return text
+
+def transcribe_state(audio, state = ""):
+    print(audio)
+    text = p(audio)["text"]
+    state += text + " "
+    return state, state
+
+
+def predict(
+    task_name: str,
+    audio_source: str,
+    input_audio_mic: str | None,
+    input_audio_file: str | None,
+    input_text: str | None,
+    source_language: str | None,
+    target_language: str,
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    task_name = task_name.split()[0]
+    source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+
+    if task_name in ["S2ST", "S2TT", "ASR"]:
+        if audio_source == "microphone":
+            input_data = input_audio_mic
+        else:
+            input_data = input_audio_file
+
+        arr, org_sr = torchaudio.load(input_data)
+        print(task_name, audio_source, input_audio_mic, type(input_audio_file), type(input_text), source_language, target_language)
+        new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+        max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+        if new_arr.shape[1] > max_length:
+            new_arr = new_arr[:, :max_length]
+            gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+        torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
+    else:
+        input_data = input_text
+    text_out, wav, sr = translator.predict(
+        input=input_data,
+        task_str=task_name,
+        tgt_lang=target_language_code,
+        src_lang=source_language_code,
+        ngram_filtering=True,
+        sample_rate=AUDIO_SAMPLE_RATE,
+    )
+    print("translation response", text_out, wav, sr)
+    # text_out = "Testing"
+    # return None, text_out
+    if task_name in ["S2ST", "T2ST"]:
+        return (sr, wav.cpu().detach().numpy()), text_out
+    else:
+        return None, text_out
+
+
+def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+    return predict(
+        task_name="S2ST",
+        audio_source="file",
+        input_audio_mic=None,
+        input_audio_file=input_audio_file,
+        input_text=None,
+        source_language=None,
+        target_language=target_language,
+    )
+
+
+def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+    return predict(
+        task_name="S2TT",
+        audio_source="file",
+        input_audio_mic=None,
+        input_audio_file=input_audio_file,
+        input_text=None,
+        source_language=None,
+        target_language=target_language,
+    )
+
+
+def process_t2st_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    return predict(
+        task_name="T2ST",
+        audio_source="",
+        input_audio_mic=None,
+        input_audio_file=None,
+        input_text=input_text,
+        source_language=source_language,
+        target_language=target_language,
+    )
+
+
+def process_t2tt_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    return predict(
+        task_name="T2TT",
+        audio_source="",
+        input_audio_mic=None,
+        input_audio_file=None,
+        input_text=input_text,
+        source_language=source_language,
+        target_language=target_language,
+    )
+
+
+def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+    return predict(
+        task_name="ASR",
+        audio_source="file",
+        input_audio_mic=None,
+        input_audio_file=input_audio_file,
+        input_text=None,
+        source_language=None,
+        target_language=target_language,
+    )
+
+
+def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
+    mic = audio_source == "microphone"
+    return (
+        gr.update(visible=mic, value=None),  # input_audio_mic
+        gr.update(visible=not mic, value=None),  # input_audio_file
+    )
+
+
+def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
+    task_name = task_name.split()[0]
+    if task_name == "S2ST":
+        return (
+            gr.update(visible=True),  # audio_box
+            gr.update(visible=False),  # input_text
+            gr.update(visible=False),  # source_language
+            gr.update(
+                visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+            ),  # target_language
+        )
+    elif task_name == "S2TT":
+        return (
+            gr.update(visible=True),  # audio_box
+            gr.update(visible=False),  # input_text
+            gr.update(visible=False),  # source_language
+            gr.update(
+                visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+            ),  # target_language
+        )
+    elif task_name == "T2ST":
+        return (
+            gr.update(visible=False),  # audio_box
+            gr.update(visible=True),  # input_text
+            gr.update(visible=True),  # source_language
+            gr.update(
+                visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+            ),  # target_language
+        )
+    elif task_name == "T2TT":
+        return (
+            gr.update(visible=False),  # audio_box
+            gr.update(visible=True),  # input_text
+            gr.update(visible=True),  # source_language
+            gr.update(
+                visible=True, choices=T2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+            ),  # target_language
+        )
+    elif task_name == "ASR":
+        return (
+            gr.update(visible=True),  # audio_box
+            gr.update(visible=False),  # input_text
+            gr.update(visible=False),  # source_language
+            gr.update(
+                visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+            ),  # target_language
+        )
+    else:
+        raise ValueError(f"Unknown task: {task_name}")
+
+
+def update_output_ui(task_name: str) -> tuple[dict, dict]:
+    task_name = task_name.split()[0]
+    if task_name in ["S2ST", "T2ST"]:
+        return (
+            gr.update(visible=True, value=None),  # output_audio
+            gr.update(value=None),  # output_text
+        )
+    elif task_name in ["S2TT", "T2TT", "ASR"]:
+        return (
+            gr.update(visible=False, value=None),  # output_audio
+            gr.update(value=None),  # output_text
+        )
+    else:
+        raise ValueError(f"Unknown task: {task_name}")
+
+
+def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
+    task_name = task_name.split()[0]
+    return (
+        gr.update(visible=task_name == "S2ST"),  # s2st_example_row
+        gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
+        gr.update(visible=task_name == "T2ST"),  # t2st_example_row
+        gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
+        gr.update(visible=task_name == "ASR"),  # asr_example_row
+    )
+
+def m4t_demo():
+
+    with gr.Blocks(css="style.css") as demo:
+        gr.Markdown(DESCRIPTION)
+        gr.DuplicateButton(
+            value="Duplicate Space for private use",
+            elem_id="duplicate-button",
+            visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+        )
+
+        with gr.Group():
+            task_name = gr.Dropdown(
+                label="Task",
+                choices=TASK_NAMES,
+                value=TASK_NAMES[0],
+            )
+
+
+            with gr.Row():
+                source_language = gr.Dropdown(
+                    label="Source language",
+                    choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                    value="English",
+                    visible=False,
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=S2ST_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            with gr.Row() as audio_box:
+                audio_source = gr.Radio(
+                    label="Audio source",
+                    choices=["file", "microphone"],
+                    value="file",
+                )
+                input_audio_mic = gr.Audio(
+                    label="Input speech",
+                    type="filepath",
+                    source="microphone",
+                    visible=False,
+                )
+                input_audio_file = gr.Audio(
+                    label="Input speech",
+                    type="filepath",
+                    source="upload",
+                    visible=True,
+                )
+            input_text = gr.Textbox(label="Input text", visible=False)
+            btn = gr.Button("Translate")
+            with gr.Column():
+                output_audio = gr.Audio(
+                    label="Translated speech",
+                    autoplay=False,
+                    streaming=False,
+                    type="numpy",
+                )
+                output_text = gr.Textbox(label="Translated text")
+
+        with gr.Row(visible=True) as s2st_example_row:
+            s2st_examples = gr.Examples(
+                examples=[
+                    ["assets/sample_input.mp3", "French"],
+                    ["assets/sample_input.mp3", "Mandarin Chinese"],
+                    ["assets/sample_input_2.mp3", "Hindi"],
+                    ["assets/sample_input_2.mp3", "Spanish"],
+                ],
+                inputs=[input_audio_file, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_s2st_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as s2tt_example_row:
+            s2tt_examples = gr.Examples(
+                examples=[
+                    ["assets/sample_input.mp3", "French"],
+                    ["assets/sample_input.mp3", "Mandarin Chinese"],
+                    ["assets/sample_input_2.mp3", "Hindi"],
+                    ["assets/sample_input_2.mp3", "Spanish"],
+                ],
+                inputs=[input_audio_file, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_s2tt_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as t2st_example_row:
+            t2st_examples = gr.Examples(
+                examples=[
+                    ["My favorite animal is the elephant.", "English", "French"],
+                    ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Hindi",
+                    ],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Spanish",
+                    ],
+                ],
+                inputs=[input_text, source_language, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_t2st_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as t2tt_example_row:
+            t2tt_examples = gr.Examples(
+                examples=[
+                    ["My favorite animal is the elephant.", "English", "French"],
+                    ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Hindi",
+                    ],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Spanish",
+                    ],
+                ],
+                inputs=[input_text, source_language, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_t2tt_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as asr_example_row:
+            asr_examples = gr.Examples(
+                examples=[
+                    ["assets/sample_input.mp3", "English"],
+                    ["assets/sample_input_2.mp3", "English"],
+                ],
+                inputs=[input_audio_file, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_asr_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+
+        audio_source.change(
+            fn=update_audio_ui,
+            inputs=audio_source,
+            outputs=[
+                input_audio_mic,
+                input_audio_file,
+            ],
+            queue=False,
+            api_name=False,
+        )
+        task_name.change(
+            fn=update_input_ui,
+            inputs=task_name,
+            outputs=[
+                audio_box,
+                input_text,
+                source_language,
+                target_language,
+            ],
+            queue=False,
+            api_name=False,
+        ).then(
+            fn=update_output_ui,
+            inputs=task_name,
+            outputs=[output_audio, output_text],
+            queue=False,
+            api_name=False,
+        ).then(
+            fn=update_example_ui,
+            inputs=task_name,
+            outputs=[
+                s2st_example_row,
+                s2tt_example_row,
+                t2st_example_row,
+                t2tt_example_row,
+                asr_example_row,
+            ],
+            queue=False,
+            api_name=False,
+        )
+
+        btn.click(
+            fn=predict,
+            inputs=[
+                task_name,
+                audio_source,
+                input_audio_mic,
+                input_audio_file,
+                input_text,
+                source_language,
+                target_language,
+            ],
+            outputs=[output_audio, output_text],
+            api_name="run",
+        )
+    demo.queue(max_size=50).launch()
+
+# Linking models to the space
+# 'facebook/seamless-m4t-large'
+# 'facebook/SONAR'
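
The new module's predict entry point can also be driven outside the UI (note that the leftover transcribe helpers still reference the removed transformers pipeline p, so predict is the working path). Below is a minimal sketch, assuming the repository's dependencies and model weights are available locally; the sample file path is one of the paths already used by the example rows above.

```python
# Sketch: call m4t_app.predict directly for S2TT, outside the Gradio UI.
# Assumes seamless_communication and the model weights are installed;
# importing m4t_app loads the Translator at import time.
from m4t_app import predict

audio_out, text_out = predict(
    task_name="S2TT (Speech to Text translation)",  # only the first token ("S2TT") is used
    audio_source="file",
    input_audio_mic=None,
    input_audio_file="assets/sample_input.mp3",
    input_text=None,
    source_language=None,
    target_language="French",
)

print(text_out)   # translated text
print(audio_out)  # None for S2TT; (sample_rate, waveform) for S2ST / T2ST
```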