ayymen committed
Commit 1f4acac
1 Parent(s): 64f6df8

Update app.py

Files changed (1)
  1. app.py +60 -45
app.py CHANGED
@@ -6,9 +6,6 @@ import os
 import uuid
 import json
 
-import jieba
-
-import nemo.collections.asr as nemo_asr
 from nemo.collections.asr.models import ASRModel
 from nemo.utils import logging
 
@@ -17,17 +14,6 @@ from align import main, AlignmentConfig, ASSFileConfig
 
 SAMPLE_RATE = 16000
 
-# Pre-download and cache the model in disk space
-logging.setLevel(logging.ERROR)
-for tmp_model_name in [
-    "stt_en_fastconformer_hybrid_large_pc",
-    "stt_de_fastconformer_hybrid_large_pc",
-    "stt_es_fastconformer_hybrid_large_pc",
-    "stt_fr_conformer_ctc_large",
-    "stt_zh_citrinet_1024_gamma_0_25",
-]:
-    tmp_model = ASRModel.from_pretrained(tmp_model_name, map_location='cpu')
-    del tmp_model
 logging.setLevel(logging.INFO)
 
 
@@ -102,9 +88,7 @@ def delete_mp4s_except_given_filepath(filepath):
             os.remove(mp4_file)
 
 
-
-
-def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
+def align(Microphone, File_Upload, text, col1, col2, col3, split_on_newline, progress=gr.Progress()):
     # Create utt_id, specify output_video_filepath and delete any MP4s
     # that are not that filepath. These stray MP4s can be created
    # if a user refreshes or exits the page while this 'align' function is executing.
@@ -115,24 +99,15 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
     delete_mp4s_except_given_filepath(output_video_filepath)
 
     output_info = ""
+    ass_text=""
 
     progress(0, desc="Validating input")
 
-    # choose model
-    if lang in ["en", "de", "es"]:
-        model_name = f"stt_{lang}_fastconformer_hybrid_large_pc"
-    elif lang in ["fr"]:
-        model_name = f"stt_{lang}_conformer_ctc_large"
-    elif lang in ["zh"]:
-        model_name = f"stt_{lang}_citrinet_1024_gamma_0_25"
-
     # decide which of Mic / File_Upload is used as input & do error handling
     if (Microphone is not None) and (File_Upload is not None):
         raise gr.Error("Please use either the microphone or file upload input - not both")
-
     elif (Microphone is None) and (File_Upload is None):
         raise gr.Error("You have to either use the microphone or upload an audio file")
-
     elif Microphone is not None:
         file = Microphone
     else:
@@ -148,6 +123,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 
     # loading model
     progress(0.1, desc="Loading speech recognition model")
+    model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
     model = ASRModel.from_pretrained(model_name)
 
     if text: # check input text is not too long compared to audio
@@ -185,9 +161,9 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
             " transcription errors, and clicking 'Submit' again."
         )
 
-    if lang == "zh" and " " not in text:
-        # use jieba to add spaces between zh characters
-        text = " ".join(jieba.cut(text))
+    # split text on new lines if requested
+    if split_on_newline:
+        text = "|".join(list(filter(None, text.split("\n"))))
 
     data = {
         "audio_filepath": audio_path,
@@ -213,7 +189,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
         additional_segment_grouping_separator="|",
         # transcribe_device='cpu',
         # viterbi_device='cpu',
-        save_output_file_formats=["ass"],
+        save_output_file_formats=["ass", "ctm"],
         ass_file_config=ASSFileConfig(
             fontsize=45,
             resegment_text_to_fill_space=resegment_text_to_fill_space,
@@ -231,12 +207,11 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
     progress(0.95, desc="Saving generated alignments")
 
 
-    if lang=="zh":
-        # make video file from the token-level ASS file
-        ass_file_for_video = f"{tmpdir}/nfa_output/ass/tokens/{utt_id}.ass"
-    else:
-        # make video file from the word-level ASS file
-        ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+    # make video file from the word-level ASS file
+    ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+
+    with open(ass_file_for_video, "r") as ass_file:
+        ass_text = ass_file.read()
 
     ffmpeg_command = (
         f"ffmpeg -y -i {audio_path} "
@@ -248,7 +223,28 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 
     os.system(ffmpeg_command)
 
-    return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath
+    # save ASS file
+    ass_path = "word_level.ass"
+    with open(ass_path, "w", encoding="utf-8") as f:
+        f.write(ass_text)
+
+    # save word-level CTM file
+    with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r") as word_ctm_file:
+        word_ctm_text = word_ctm_file.read()
+
+    word_ctm_path = "word_level.ctm"
+    with open(word_ctm_path, "w", encoding="utf-8") as f:
+        f.write(word_ctm_text)
+
+    # save segment-level CTM file
+    with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r") as segment_ctm_file:
+        segment_ctm_text = segment_ctm_file.read()
+
+    segment_ctm_path = "segment_level.ctm"
+    with open(segment_ctm_path, "w", encoding="utf-8") as f:
+        f.write(segment_ctm_text)
+
+    return output_video_filepath, gr.update(value=output_info, visible=True if output_info else False), output_video_filepath, gr.update(value=ass_path, visible=True), gr.update(value=word_ctm_path, visible=True), gr.update(value=segment_ctm_path, visible=True)
 
 
 def delete_non_tmp_video(video_path):
@@ -273,14 +269,16 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
         with gr.Column(scale=1):
             gr.Markdown("## Input")
-            lang_drop = gr.Dropdown(choices=["de", "en", "es", "fr", "zh"], value="en", label="Audio language",)
-
             mic_in = gr.Audio(sources=["microphone"], type='filepath', label="Microphone input (max 4 mins)")
             audio_file_in = gr.Audio(sources=["upload"], type='filepath', label="File upload (max 4 mins)")
             ref_text = gr.Textbox(
                 label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
                 "Leave this field blank to use an ASR model's transcription as the reference text instead."
             )
+            split_on_newline = gr.Checkbox(
+                True,
+                label="Separate text on new lines",
+            )
 
             gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
             with gr.Row():
@@ -292,8 +290,11 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
         with gr.Column(scale=1):
             gr.Markdown("## Output")
-            video_out = gr.Video(label="output video")
-            text_out = gr.Textbox(label="output info", visible=False)
+            video_out = gr.Video(label="Output Video")
+            text_out = gr.Textbox(label="Output Info", visible=False)
+            ass_file = gr.File(label="ASS File", visible=False)
+            word_ctm_file = gr.File(label="Word-level CTM File", visible=False)
+            segment_ctm_file = gr.File(label="Segment-level CTM File", visible=False)
 
     with gr.Row():
         gr.HTML(
@@ -306,12 +307,26 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
     submit_button.click(
         fn=align,
-        inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,],
-        outputs=[video_out, text_out, non_tmp_output_video_filepath],
+        inputs=[mic_in, audio_file_in, ref_text, col1, col2, col3, split_on_newline],
+        outputs=[video_out, text_out, non_tmp_output_video_filepath, ass_file, word_ctm_file, segment_ctm_file],
     ).then(
         fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
    )
+    example_2 = """ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.
+ⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ.
+ⴰⵎⵓⵢ ⵉ ⵕⴱⴱⵉ ⵍⵍⵉ ⵎⵓ ⵜⴳⴰ ⵜⵓⵍⵖⵉⵜ ⵜⵉⵏⵏⵙ, ⵕⴱⴱⵉ ⵏ ⵉⵖⵥⵡⴰⵕⵏ, ⴽⵔⴰ ⴳⴰⵏ.
+ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ, ⵖ ⵜⵎⵣⵡⴰⵔⵓⵜ ⵓⵍⴰ ⵖ ⵜⵎⴳⴳⴰⵔⵓⵜ.
+ⴰⴳⵍⵍⵉⴷ ⵏ ⵡⴰⵙⵙ ⵏ ⵓⴼⵔⴰ, ⴰⵙⵙ ⵏ ⵓⵙⵙⵃⵙⵓ, ⴽⵔⴰⵉⴳⴰⵜ ⵢⴰⵏ ⴷ ⵎⴰⴷ ⵉⵙⴽⵔ.
+ⵀⴰ ⵏⵏ ⴽⵢⵢⵉ ⴽⴰ ⵙ ⵏⵙⵙⵓⵎⴷ, ⴷ ⴽⵢⵢⵉ ⴽⴰ ⴰⴷ ⵏⵎⵎⵜⵔ.
+ⵙⵎⵓⵏ ⴰⵖ, ⵜⵎⵍⵜ ⴰⵖ, ⴰⵖⴰⵔⴰⵙ ⵢⵓⵖⴷⵏ.
+ⴰⵖⴰⵔⴰⵙ ⵏ ⵖⵡⵉⵍⵍⵉ ⵜⵙⵏⵏⵓⴼⴰⵜ, ⵓⵔ ⴷ ⴰⵢⵜ ⵜⵉⵢⵓⵔⵉ, ⵓⵍⴰ ⵉⵎⵓⴹⴹⴰⵕ."""
+    examples = gr.Examples(
+        examples=[
+            ["common_voice_zgh_37837257.mp3", "ⵎⵍ ⵉⵢⵉ ⵎⴰⴷ ⴷ ⵜⴻⵜⵜⵎⵓⵏⴷ ⴰⴷ ⴰⴽ ⵎⵍⵖ ⵎⴰⴷ ⵜⴳⵉⴷ"],
+            ["Voice1410.wav", example_2]
+        ],
+        inputs=[audio_file_in, ref_text]
+    )
 
 demo.queue()
 demo.launch()
-
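
As a quick illustration (not part of the commit itself), the new split_on_newline branch in align() turns a multi-line reference text into the '|'-separated form that NFA uses for segment grouping. A minimal standalone Python sketch:

    # Drop empty lines, then join the remaining lines with the '|' segment separator,
    # mirroring the split_on_newline handling added in align().
    text = "ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.\n\nⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ."
    text = "|".join(list(filter(None, text.split("\n"))))
    print(text)  # ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.|ⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ.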