ayymen committed
Commit 1f4acac
1 Parent(s): 64f6df8

Update app.py

Files changed (1)
  1. app.py +60 -45
app.py CHANGED
@@ -6,9 +6,6 @@ import os
 import uuid
 import json
 
-import jieba
-
-import nemo.collections.asr as nemo_asr
 from nemo.collections.asr.models import ASRModel
 from nemo.utils import logging
 
@@ -17,17 +14,6 @@ from align import main, AlignmentConfig, ASSFileConfig
 
 SAMPLE_RATE = 16000
 
-# Pre-download and cache the model in disk space
-logging.setLevel(logging.ERROR)
-for tmp_model_name in [
-    "stt_en_fastconformer_hybrid_large_pc",
-    "stt_de_fastconformer_hybrid_large_pc",
-    "stt_es_fastconformer_hybrid_large_pc",
-    "stt_fr_conformer_ctc_large",
-    "stt_zh_citrinet_1024_gamma_0_25",
-]:
-    tmp_model = ASRModel.from_pretrained(tmp_model_name, map_location='cpu')
-    del tmp_model
 logging.setLevel(logging.INFO)
 
 
@@ -102,9 +88,7 @@ def delete_mp4s_except_given_filepath(filepath):
             os.remove(mp4_file)
 
 
-
-
-def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
+def align(Microphone, File_Upload, text, col1, col2, col3, split_on_newline, progress=gr.Progress()):
     # Create utt_id, specify output_video_filepath and delete any MP4s
     # that are not that filepath. These stray MP4s can be created
    # if a user refreshes or exits the page while this 'align' function is executing.
@@ -115,24 +99,15 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
     delete_mp4s_except_given_filepath(output_video_filepath)
 
     output_info = ""
+    ass_text=""
 
     progress(0, desc="Validating input")
 
-    # choose model
-    if lang in ["en", "de", "es"]:
-        model_name = f"stt_{lang}_fastconformer_hybrid_large_pc"
-    elif lang in ["fr"]:
-        model_name = f"stt_{lang}_conformer_ctc_large"
-    elif lang in ["zh"]:
-        model_name = f"stt_{lang}_citrinet_1024_gamma_0_25"
-
     # decide which of Mic / File_Upload is used as input & do error handling
     if (Microphone is not None) and (File_Upload is not None):
         raise gr.Error("Please use either the microphone or file upload input - not both")
-
     elif (Microphone is None) and (File_Upload is None):
         raise gr.Error("You have to either use the microphone or upload an audio file")
-
     elif Microphone is not None:
         file = Microphone
     else:
@@ -148,6 +123,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 
     # loading model
     progress(0.1, desc="Loading speech recognition model")
+    model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
     model = ASRModel.from_pretrained(model_name)
 
     if text: # check input text is not too long compared to audio
@@ -185,9 +161,9 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
             " transcription errors, and clicking 'Submit' again."
         )
 
-    if lang == "zh" and " " not in text:
-        # use jieba to add spaces between zh characters
-        text = " ".join(jieba.cut(text))
+    # split text on new lines if requested
+    if split_on_newline:
+        text = "|".join(list(filter(None, text.split("\n"))))
 
     data = {
         "audio_filepath": audio_path,
@@ -213,7 +189,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
         additional_segment_grouping_separator="|",
         # transcribe_device='cpu',
         # viterbi_device='cpu',
-        save_output_file_formats=["ass"],
+        save_output_file_formats=["ass", "ctm"],
         ass_file_config=ASSFileConfig(
             fontsize=45,
             resegment_text_to_fill_space=resegment_text_to_fill_space,
@@ -231,12 +207,11 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
     progress(0.95, desc="Saving generated alignments")
 
 
-    if lang=="zh":
-        # make video file from the token-level ASS file
-        ass_file_for_video = f"{tmpdir}/nfa_output/ass/tokens/{utt_id}.ass"
-    else:
-        # make video file from the word-level ASS file
-        ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+    # make video file from the word-level ASS file
+    ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+
+    with open(ass_file_for_video, "r") as ass_file:
+        ass_text = ass_file.read()
 
     ffmpeg_command = (
         f"ffmpeg -y -i {audio_path} "
@@ -248,7 +223,28 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 
     os.system(ffmpeg_command)
 
-    return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath
+    # save ASS file
+    ass_path = "word_level.ass"
+    with open(ass_path, "w", encoding="utf-8") as f:
+        f.write(ass_text)
+
+    # save word-level CTM file
+    with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r") as word_ctm_file:
+        word_ctm_text = word_ctm_file.read()
+
+    word_ctm_path = "word_level.ctm"
+    with open(word_ctm_path, "w", encoding="utf-8") as f:
+        f.write(word_ctm_text)
+
+    # save segment-level CTM file
+    with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r") as segment_ctm_file:
+        segment_ctm_text = segment_ctm_file.read()
+
+    segment_ctm_path = "segment_level.ctm"
+    with open(segment_ctm_path, "w", encoding="utf-8") as f:
+        f.write(segment_ctm_text)
+
+    return output_video_filepath, gr.update(value=output_info, visible=True if output_info else False), output_video_filepath, gr.update(value=ass_path, visible=True), gr.update(value=word_ctm_path, visible=True), gr.update(value=segment_ctm_path, visible=True)
 
 
 def delete_non_tmp_video(video_path):
@@ -273,14 +269,16 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
         with gr.Column(scale=1):
             gr.Markdown("## Input")
-            lang_drop = gr.Dropdown(choices=["de", "en", "es", "fr", "zh"], value="en", label="Audio language",)
-
             mic_in = gr.Audio(sources=["microphone"], type='filepath', label="Microphone input (max 4 mins)")
             audio_file_in = gr.Audio(sources=["upload"], type='filepath', label="File upload (max 4 mins)")
             ref_text = gr.Textbox(
                 label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
                 "Leave this field blank to use an ASR model's transcription as the reference text instead."
             )
+            split_on_newline = gr.Checkbox(
+                True,
+                label="Separate text on new lines",
+            )
 
             gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
             with gr.Row():
@@ -292,8 +290,11 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
         with gr.Column(scale=1):
             gr.Markdown("## Output")
-            video_out = gr.Video(label="output video")
-            text_out = gr.Textbox(label="output info", visible=False)
+            video_out = gr.Video(label="Output Video")
+            text_out = gr.Textbox(label="Output Info", visible=False)
+            ass_file = gr.File(label="ASS File", visible=False)
+            word_ctm_file = gr.File(label="Word-level CTM File", visible=False)
+            segment_ctm_file = gr.File(label="Segment-level CTM File", visible=False)
 
     with gr.Row():
         gr.HTML(
@@ -306,12 +307,26 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
     submit_button.click(
         fn=align,
-        inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,],
-        outputs=[video_out, text_out, non_tmp_output_video_filepath],
+        inputs=[mic_in, audio_file_in, ref_text, col1, col2, col3, split_on_newline],
+        outputs=[video_out, text_out, non_tmp_output_video_filepath, ass_file, word_ctm_file, segment_ctm_file],
     ).then(
         fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
    )
+    example_2 = """ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.
+ⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ.
+ⴰⵎⵓⵢ ⵉ ⵕⴱⴱⵉ ⵍⵍⵉ ⵎⵓ ⵜⴳⴰ ⵜⵓⵍⵖⵉⵜ ⵜⵉⵏⵏⵙ, ⵕⴱⴱⵉ ⵏ ⵉⵖⵥⵡⴰⵕⵏ, ⴽⵔⴰ ⴳⴰⵏ.
+ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ, ⵖ ⵜⵎⵣⵡⴰⵔⵓⵜ ⵓⵍⴰ ⵖ ⵜⵎⴳⴳⴰⵔⵓⵜ.
+ⴰⴳⵍⵍⵉⴷ ⵏ ⵡⴰⵙⵙ ⵏ ⵓⴼⵔⴰ, ⴰⵙⵙ ⵏ ⵓⵙⵙⵃⵙⵓ, ⴽⵔⴰⵉⴳⴰⵜ ⵢⴰⵏ ⴷ ⵎⴰⴷ ⵉⵙⴽⵔ.
+ⵀⴰ ⵏⵏ ⴽⵢⵢⵉ ⴽⴰ ⵙ ⵏⵙⵙⵓⵎⴷ, ⴷ ⴽⵢⵢⵉ ⴽⴰ ⴰⴷ ⵏⵎⵎⵜⵔ.
+ⵙⵎⵓⵏ ⴰⵖ, ⵜⵎⵍⵜ ⴰⵖ, ⴰⵖⴰⵔⴰⵙ ⵢⵓⵖⴷⵏ.
+ⴰⵖⴰⵔⴰⵙ ⵏ ⵖⵡⵉⵍⵍⵉ ⵜⵙⵏⵏⵓⴼⴰⵜ, ⵓⵔ ⴷ ⴰⵢⵜ ⵜⵉⵢⵓⵔⵉ, ⵓⵍⴰ ⵉⵎⵓⴹⴹⴰⵕ."""
+    examples = gr.Examples(
+        examples=[
+            ["common_voice_zgh_37837257.mp3", "ⵎⵍ ⵉⵢⵉ ⵎⴰⴷ ⴷ ⵜⴻⵜⵜⵎⵓⵏⴷ ⴰⴷ ⴰⴽ ⵎⵍⵖ ⵎⴰⴷ ⵜⴳⵉⴷ"],
+            ["Voice1410.wav", example_2]
+        ],
+        inputs=[audio_file_in, ref_text]
+    )
 
 demo.queue()
 demo.launch()
-
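
As a quick illustration (not part of the commit itself), the new split_on_newline branch in align() turns a multi-line reference text into the '|'-separated form that NFA uses for segment grouping. A minimal standalone Python sketch:

    # Drop empty lines, then join the remaining lines with the '|' segment separator,
    # mirroring the split_on_newline handling added in align().
    text = "ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.\n\nⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ."
    text = "|".join(list(filter(None, text.split("\n"))))
    print(text)  # ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.|ⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ.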