erastorgueva-nv committed
Commit
6ffdd29
1 Parent(s): c7e8b60

Initial commit

align.py ADDED
@@ -0,0 +1,352 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import math
17
+ import os
18
+ from dataclasses import dataclass, field, is_dataclass
19
+ from pathlib import Path
20
+ from typing import List, Optional
21
+
22
+ import torch
23
+ from omegaconf import OmegaConf
24
+ from utils.data_prep import (
25
+ add_t_start_end_to_utt_obj,
26
+ get_batch_starts_ends,
27
+ get_batch_variables,
28
+ get_manifest_lines_batch,
29
+ is_entry_in_all_lines,
30
+ is_entry_in_any_lines,
31
+ )
32
+ from utils.make_ass_files import make_ass_files
33
+ from utils.make_ctm_files import make_ctm_files
34
+ from utils.make_output_manifest import write_manifest_out_line
35
+ from utils.viterbi_decoding import viterbi_decoding
36
+
37
+ from nemo.collections.asr.models.ctc_models import EncDecCTCModel
38
+ from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel
39
+ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR
40
+ from nemo.collections.asr.parts.utils.transcribe_utils import setup_model
41
+ from nemo.core.config import hydra_runner
42
+ from nemo.utils import logging
43
+
44
+ """
45
+ Align the utterances in manifest_filepath.
46
+ Results are saved as CTM and/or ASS files (per save_output_file_formats) in output_dir.
47
+
48
+ Arguments:
49
+ pretrained_name: string specifying the name of a CTC NeMo ASR model which will be automatically downloaded
50
+ from NGC and used for generating the log-probs which we will use to do alignment.
51
+ Note: NFA can only use CTC models (not Transducer models) at the moment.
52
+ model_path: string specifying the local filepath to a CTC NeMo ASR model which will be used to generate the
53
+ log-probs which we will use to do alignment.
54
+ Note: NFA can only use CTC models (not Transducer models) at the moment.
55
+ Note: if a model_path is provided, it will override the pretrained_name.
56
+ manifest_filepath: filepath to the manifest of the data you want to align,
57
+ containing 'audio_filepath' and 'text' fields.
58
+ output_dir: the folder where output CTM files and new JSON manifest will be saved.
59
+ align_using_pred_text: if True, will transcribe the audio using the specified model and then use that transcription
60
+ as the reference text for the forced alignment.
61
+ transcribe_device: None, or a string specifying the device that will be used for generating log-probs (i.e. "transcribing").
62
+ The string needs to be in a format recognized by torch.device(). If None, NFA will set it to 'cuda' if it is available
63
+ (otherwise will set it to 'cpu').
64
+ viterbi_device: None, or string specifying the device that will be used for doing Viterbi decoding.
65
+ The string needs to be in a format recognized by torch.device(). If None, NFA will set it to 'cuda' if it is available
66
+ (otherwise will set it to 'cpu').
67
+ batch_size: int specifying batch size that will be used for generating log-probs and doing Viterbi decoding.
68
+ use_local_attention: boolean flag specifying whether to try to use local attention for the ASR Model (will only
69
+ work if the ASR Model is a Conformer model). If local attention is used, we will set the local attention context
70
+ size to [64,64].
71
+ additional_segment_grouping_separator: an optional string used to separate the text into smaller segments.
72
+ If this is not specified, then the whole text will be treated as a single segment.
73
+ remove_blank_tokens_from_ctm: a boolean denoting whether to remove <blank> tokens from token-level output CTMs.
74
+ audio_filepath_parts_in_utt_id: int specifying how many of the 'parts' of the audio_filepath
75
+ we will use (starting from the final part of the audio_filepath) to determine the
76
+ utt_id that will be used in the CTM files. Note also that any spaces that are present in the audio_filepath
77
+ will be replaced with dashes, so as not to change the number of space-separated elements in the
78
+ CTM files.
79
+ e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 1 => utt_id will be "e1"
80
+ e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 2 => utt_id will be "d_e1"
81
+ e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 3 => utt_id will be "c_d_e1"
82
+ use_buffered_infer: if set to True, use buffered (chunked) streaming to obtain the logits for alignment.
83
+ This flag is useful when aligning large audio files.
84
+ However, chunked streaming inference currently does not support batch inference,
85
+ which means that even if you set batch_size > 1, utterances will be inferred one by one
86
+ instead of as a whole batch.
87
+ chunk_len_in_secs: float chunk length in seconds
88
+ total_buffer_in_secs: float Length of buffer (chunk + left and right padding) in seconds
89
+ chunk_batch_size: int batch size for buffered chunk inference,
90
+ which will cut one audio into segments and do inference on chunk_batch_size segments at a time
91
+
92
+ simulate_cache_aware_streaming: if set to True, use cache-aware streaming to obtain the logits for alignment.
93
+
94
+ save_output_file_formats: List of strings specifying what type of output files to save (default: ["ctm", "ass"])
95
+ ctm_file_config: CTMFileConfig to specify the configuration of the output CTM files
96
+ ass_file_config: ASSFileConfig to specify the configuration of the output ASS files
97
+ """
98
+
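For reference, each line of the input manifest is a JSON object. A minimal illustrative line (the path is a placeholder), mirroring how app.py below builds its manifest, would be:

    data = {"audio_filepath": "/path/to/audio.wav", "text": "the transcript of the audio"}
    # one json.dumps(data) per manifest line

When align_using_pred_text=True the 'text' field is not required, since the model's own transcription is used as the reference text.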
99
+
100
+ @dataclass
101
+ class CTMFileConfig:
102
+ remove_blank_tokens: bool = False
103
+ # minimum duration (in seconds) for timestamps in the CTM. If any line in the CTM has a
104
+ # duration lower than this, it will be enlarged from the middle outwards until it
105
+ # meets the minimum_timestamp_duration, or reaches the beginning or end of the audio file.
106
+ # Note that this may cause timestamps to overlap.
107
+ minimum_timestamp_duration: float = 0
108
+
109
+
110
+ @dataclass
111
+ class ASSFileConfig:
112
+ fontsize: int = 20
113
+ vertical_alignment: str = "center"
114
+ # if resegment_text_to_fill_space is True, the ASS files will use new segments
115
+ # such that each segment will not take up more than (approximately) max_lines_per_segment lines
116
+ # when the ASS file is applied to a video
117
+ resegment_text_to_fill_space: bool = False
118
+ max_lines_per_segment: int = 2
119
+ text_already_spoken_rgb: List[int] = field(default_factory=lambda: [49, 46, 61]) # dark gray
120
+ text_being_spoken_rgb: List[int] = field(default_factory=lambda: [57, 171, 9]) # dark green
121
+ text_not_yet_spoken_rgb: List[int] = field(default_factory=lambda: [194, 193, 199]) # light gray
122
+
123
+
124
+ @dataclass
125
+ class AlignmentConfig:
126
+ # Required configs
127
+ pretrained_name: Optional[str] = None
128
+ model_path: Optional[str] = None
129
+ manifest_filepath: Optional[str] = None
130
+ output_dir: Optional[str] = None
131
+
132
+ # General configs
133
+ align_using_pred_text: bool = False
134
+ transcribe_device: Optional[str] = None
135
+ viterbi_device: Optional[str] = None
136
+ batch_size: int = 1
137
+ use_local_attention: bool = True
138
+ additional_segment_grouping_separator: Optional[str] = None
139
+ audio_filepath_parts_in_utt_id: int = 1
140
+
141
+ # Buffered chunked streaming configs
142
+ use_buffered_chunked_streaming: bool = False
143
+ chunk_len_in_secs: float = 1.6
144
+ total_buffer_in_secs: float = 4.0
145
+ chunk_batch_size: int = 32
146
+
147
+ # Cache aware streaming configs
148
+ simulate_cache_aware_streaming: Optional[bool] = False
149
+
150
+ # Output file configs
151
+ save_output_file_formats: List[str] = field(default_factory=lambda: ["ctm", "ass"])
152
+ ctm_file_config: CTMFileConfig = CTMFileConfig()
153
+ ass_file_config: ASSFileConfig = ASSFileConfig()
154
+
155
+
156
+ @hydra_runner(config_name="AlignmentConfig", schema=AlignmentConfig)
157
+ def main(cfg: AlignmentConfig):
158
+
159
+ logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
160
+
161
+ if is_dataclass(cfg):
162
+ cfg = OmegaConf.structured(cfg)
163
+
164
+ # Validate config
165
+ if cfg.model_path is None and cfg.pretrained_name is None:
166
+ raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None")
167
+
168
+ if cfg.model_path is not None and cfg.pretrained_name is not None:
169
+ raise ValueError("One of cfg.model_path and cfg.pretrained_name must be None")
170
+
171
+ if cfg.manifest_filepath is None:
172
+ raise ValueError("cfg.manifest_filepath must be specified")
173
+
174
+ if cfg.output_dir is None:
175
+ raise ValueError("cfg.output_dir must be specified")
176
+
177
+ if cfg.batch_size < 1:
178
+ raise ValueError("cfg.batch_size cannot be zero or a negative number")
179
+
180
+ if cfg.additional_segment_grouping_separator == "" or cfg.additional_segment_grouping_separator == " ":
181
+ raise ValueError("cfg.additional_grouping_separator cannot be empty string or space character")
182
+
183
+ if cfg.ctm_file_config.minimum_timestamp_duration < 0:
184
+ raise ValueError("cfg.minimum_timestamp_duration cannot be a negative number")
185
+
186
+ if cfg.ass_file_config.vertical_alignment not in ["top", "center", "bottom"]:
187
+ raise ValueError("cfg.ass_file_config.vertical_alignment must be one of 'top', 'center' or 'bottom'")
188
+
189
+ for rgb_list in [
190
+ cfg.ass_file_config.text_already_spoken_rgb,
191
+ cfg.ass_file_config.text_being_spoken_rgb,
192
+ cfg.ass_file_config.text_not_yet_spoken_rgb,
193
+ ]:
194
+ if len(rgb_list) != 3:
195
+ raise ValueError(
196
+ "cfg.ass_file_config.text_already_spoken_rgb,"
197
+ " cfg.ass_file_config.text_being_spoken_rgb,"
198
+ " and cfg.ass_file_config.text_already_spoken_rgb all need to contain"
199
+ " exactly 3 elements."
200
+ )
201
+
202
+ # Validate manifest contents
203
+ if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"):
204
+ raise RuntimeError(
205
+ "At least one line in cfg.manifest_filepath does not contain an 'audio_filepath' entry. "
206
+ "All lines must contain an 'audio_filepath' entry."
207
+ )
208
+
209
+ if cfg.align_using_pred_text:
210
+ if is_entry_in_any_lines(cfg.manifest_filepath, "pred_text"):
211
+ raise RuntimeError(
212
+ "Cannot specify cfg.align_using_pred_text=True when the manifest at cfg.manifest_filepath "
213
+ "contains 'pred_text' entries. This is because the audio will be transcribed and may produce "
214
+ "a different 'pred_text'. This may cause confusion."
215
+ )
216
+ else:
217
+ if not is_entry_in_all_lines(cfg.manifest_filepath, "text"):
218
+ raise RuntimeError(
219
+ "At least one line in cfg.manifest_filepath does not contain a 'text' entry. "
220
+ "NFA requires all lines to contain a 'text' entry when cfg.align_using_pred_text=False."
221
+ )
222
+
223
+ # init devices
224
+ if cfg.transcribe_device is None:
225
+ transcribe_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
226
+ else:
227
+ transcribe_device = torch.device(cfg.transcribe_device)
228
+ logging.info(f"Device to be used for transcription step (`transcribe_device`) is {transcribe_device}")
229
+
230
+ if cfg.viterbi_device is None:
231
+ viterbi_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
232
+ else:
233
+ viterbi_device = torch.device(cfg.viterbi_device)
234
+ logging.info(f"Device to be used for viterbi step (`viterbi_device`) is {viterbi_device}")
235
+
236
+ if transcribe_device.type == 'cuda' or viterbi_device.type == 'cuda':
237
+ logging.warning(
238
+ 'One or both of transcribe_device and viterbi_device are GPUs. If you run into OOM errors '
239
+ 'it may help to change both devices to be the CPU.'
240
+ )
241
+
242
+ # load model
243
+ model, _ = setup_model(cfg, transcribe_device)
244
+ model.eval()
245
+
246
+ if isinstance(model, EncDecHybridRNNTCTCModel):
247
+ model.change_decoding_strategy(decoder_type="ctc")
248
+
249
+ if cfg.use_local_attention:
250
+ logging.info(
251
+ "Flag use_local_attention is set to True => will try to use local attention for model if it allows it"
252
+ )
253
+ model.change_attention_model(self_attention_model="rel_pos_local_attn", att_context_size=[64, 64])
254
+
255
+ if not (isinstance(model, EncDecCTCModel) or isinstance(model, EncDecHybridRNNTCTCModel)):
256
+ raise NotImplementedError(
257
+ f"Model is not an instance of NeMo EncDecCTCModel or ENCDecHybridRNNTCTCModel."
258
+ " Currently only instances of these models are supported"
259
+ )
260
+
261
+ if cfg.ctm_file_config.minimum_timestamp_duration > 0:
262
+ logging.warning(
263
+ f"cfg.ctm_file_config.minimum_timestamp_duration has been set to {cfg.ctm_file_config.minimum_timestamp_duration} seconds. "
264
+ "This may cause the alignments for some tokens/words/additional segments to be overlapping."
265
+ )
266
+
267
+ buffered_chunk_params = {}
268
+ if cfg.use_buffered_chunked_streaming:
269
+ model_cfg = copy.deepcopy(model._cfg)
270
+
271
+ OmegaConf.set_struct(model_cfg.preprocessor, False)
272
+ # some changes for streaming scenario
273
+ model_cfg.preprocessor.dither = 0.0
274
+ model_cfg.preprocessor.pad_to = 0
275
+
276
+ if model_cfg.preprocessor.normalize != "per_feature":
277
+ logging.error(
278
+ "Only EncDecCTCModelBPE models trained with per_feature normalization are supported currently"
279
+ )
280
+ # Disable config overwriting
281
+ OmegaConf.set_struct(model_cfg.preprocessor, True)
282
+
283
+ feature_stride = model_cfg.preprocessor['window_stride']
284
+ model_stride_in_secs = feature_stride * cfg.model_downsample_factor
285
+ total_buffer = cfg.total_buffer_in_secs
286
+ chunk_len = float(cfg.chunk_len_in_secs)
287
+ tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs)
288
+ mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs)
289
+ logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}")
290
+
291
+ model = FrameBatchASR(
292
+ asr_model=model,
293
+ frame_len=chunk_len,
294
+ total_buffer=cfg.total_buffer_in_secs,
295
+ batch_size=cfg.chunk_batch_size,
296
+ )
297
+ buffered_chunk_params = {
298
+ "delay": mid_delay,
299
+ "model_stride_in_secs": model_stride_in_secs,
300
+ "tokens_per_chunk": tokens_per_chunk,
301
+ }
302
+ # get start and end line IDs of batches
303
+ starts, ends = get_batch_starts_ends(cfg.manifest_filepath, cfg.batch_size)
304
+
305
+ # output_timestep_duration is initialized as None; it will be calculated and updated during the first batch
306
+ output_timestep_duration = None
307
+
308
+ # init f_manifest_out
309
+ os.makedirs(cfg.output_dir, exist_ok=True)
310
+ tgt_manifest_name = str(Path(cfg.manifest_filepath).stem) + "_with_output_file_paths.json"
311
+ tgt_manifest_filepath = str(Path(cfg.output_dir) / tgt_manifest_name)
312
+ f_manifest_out = open(tgt_manifest_filepath, 'w')
313
+
314
+ # get alignment and save in CTM batch-by-batch
315
+ for start, end in zip(starts, ends):
316
+ manifest_lines_batch = get_manifest_lines_batch(cfg.manifest_filepath, start, end)
317
+
318
+ (log_probs_batch, y_batch, T_batch, U_batch, utt_obj_batch, output_timestep_duration,) = get_batch_variables(
319
+ manifest_lines_batch,
320
+ model,
321
+ cfg.additional_segment_grouping_separator,
322
+ cfg.align_using_pred_text,
323
+ cfg.audio_filepath_parts_in_utt_id,
324
+ output_timestep_duration,
325
+ cfg.simulate_cache_aware_streaming,
326
+ cfg.use_buffered_chunked_streaming,
327
+ buffered_chunk_params,
328
+ )
329
+
330
+ alignments_batch = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device)
331
+
332
+ for utt_obj, alignment_utt in zip(utt_obj_batch, alignments_batch):
333
+
334
+ utt_obj = add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration)
335
+
336
+ if "ctm" in cfg.save_output_file_formats:
337
+ utt_obj = make_ctm_files(utt_obj, cfg.output_dir, cfg.ctm_file_config,)
338
+
339
+ if "ass" in cfg.save_output_file_formats:
340
+ utt_obj = make_ass_files(utt_obj, cfg.output_dir, cfg.ass_file_config)
341
+
342
+ write_manifest_out_line(
343
+ f_manifest_out, utt_obj,
344
+ )
345
+
346
+ f_manifest_out.close()
347
+
348
+ return None
349
+
350
+
351
+ if __name__ == "__main__":
352
+ main()
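Because main() is wrapped in hydra_runner, the config fields documented above can be overridden from the command line (e.g. python align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc manifest_filepath=/path/to/manifest.json output_dir=/path/to/nfa_output/). The script can also be driven programmatically, which is how app.py below uses it; a minimal sketch (model name and paths are placeholders):

    from align import main, AlignmentConfig

    config = AlignmentConfig(
        pretrained_name="stt_en_fastconformer_hybrid_large_pc",  # any CTC or hybrid CTC model
        manifest_filepath="/path/to/manifest.json",
        output_dir="/path/to/nfa_output/",
        save_output_file_formats=["ctm", "ass"],
    )
    main(config)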
app.py ADDED
@@ -0,0 +1,314 @@
1
+ import gradio as gr
2
+ import librosa
3
+ import soundfile
4
+ import tempfile
5
+ import os
6
+ import uuid
7
+ import json
8
+
9
+ import jieba
10
+
11
+ import nemo.collections.asr as nemo_asr
12
+ from nemo.collections.asr.models import ASRModel
13
+ from nemo.utils import logging
14
+
15
+ from align import main, AlignmentConfig, ASSFileConfig
16
+
17
+
18
+ SAMPLE_RATE = 16000
19
+
20
+ # Pre-download and cache the models on disk
21
+ logging.setLevel(logging.ERROR)
22
+ for tmp_model_name in [
23
+ "stt_en_fastconformer_hybrid_large_pc",
24
+ "stt_de_fastconformer_hybrid_large_pc",
25
+ "stt_es_fastconformer_hybrid_large_pc",
26
+ "stt_fr_conformer_ctc_large",
27
+ "stt_zh_citrinet_1024_gamma_0_25",
28
+ ]:
29
+ tmp_model = ASRModel.from_pretrained(tmp_model_name, map_location='cpu')
30
+ del tmp_model
31
+ logging.setLevel(logging.INFO)
32
+
33
+
34
+ def get_audio_data_and_duration(file):
35
+ data, sr = librosa.load(file)
36
+
37
+ if sr != SAMPLE_RATE:
38
+ data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
39
+
40
+ # monochannel
41
+ data = librosa.to_mono(data)
42
+
43
+ duration = librosa.get_duration(y=data, sr=SAMPLE_RATE)
44
+ return data, duration
45
+
46
+
47
+ def get_char_tokens(text, model):
48
+ tokens = []
49
+ for character in text:
50
+ if character in model.decoder.vocabulary:
51
+ tokens.append(model.decoder.vocabulary.index(character))
52
+ else:
53
+ tokens.append(len(model.decoder.vocabulary)) # return unk token (same as blank token)
54
+
55
+ return tokens
56
+
57
+
58
+ def get_S_prime_and_T(text, model_name, model, audio_duration):
59
+
60
+ # estimate T
61
+ if "citrinet" in model_name or "_fastconformer_" in model_name:
62
+ output_timestep_duration = 0.08
63
+ elif "_conformer_" in model_name:
64
+ output_timestep_duration = 0.04
65
+ elif "quartznet" in model_name:
66
+ output_timestep_duration = 0.02
67
+ else:
68
+ raise RuntimeError("unexpected model name")
69
+
70
+ T = int(audio_duration / output_timestep_duration) + 1
71
+
72
+ # calculate S_prime = num tokens + num repetitions
73
+ if hasattr(model, 'tokenizer'):
74
+ all_tokens = model.tokenizer.text_to_ids(text)
75
+ elif hasattr(model.decoder, "vocabulary"): # i.e. tokenization is simply character-based
76
+ all_tokens = get_char_tokens(text, model)
77
+ else:
78
+ raise RuntimeError("cannot obtain tokens from this model")
79
+
80
+ n_token_repetitions = 0
81
+ for i_tok in range(1, len(all_tokens)):
82
+ if all_tokens[i_tok] == all_tokens[i_tok - 1]:
83
+ n_token_repetitions += 1
84
+
85
+ S_prime = len(all_tokens) + n_token_repetitions
86
+
87
+ print('all_tokens', all_tokens)
88
+ print(len(all_tokens))
89
+ print(n_token_repetitions)
90
+
91
+ return S_prime, T
92
+
93
+
94
+ def hex_to_rgb_list(hex_string):
95
+ hex_string = hex_string.lstrip("#")
96
+ r = int(hex_string[:2], 16)
97
+ g = int(hex_string[2:4], 16)
98
+ b = int(hex_string[4:], 16)
99
+ return [r, g, b]
100
+
101
+ def delete_mp4s_except_given_filepath(filepath):
102
+ files_in_dir = os.listdir()
103
+ mp4_files_in_dir = [x for x in files_in_dir if x.endswith(".mp4")]
104
+ for mp4_file in mp4_files_in_dir:
105
+ if mp4_file != filepath:
106
+ print('deleting', mp4_file)
107
+ os.remove(mp4_file)
108
+
109
+
110
+
111
+
112
+ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
113
+ # Create utt_id, specify output_video_filepath and delete any MP4s
114
+ # that are not that filepath. These stray MP4s can be created
115
+ # if a user refreshes or exits the page while this 'align' function is executing.
116
+ # This deletion will not delete any other users' video as long as this 'align' function
117
+ # is run one at a time.
118
+ utt_id = uuid.uuid4()
119
+ output_video_filepath = f"{utt_id}.mp4"
120
+ delete_mp4s_except_given_filepath(output_video_filepath)
121
+
122
+ output_info = ""
123
+
124
+ progress(0, desc="Validating input")
125
+
126
+ # choose model
127
+ if lang in ["en", "de", "es"]:
128
+ model_name = f"stt_{lang}_fastconformer_hybrid_large_pc"
129
+ elif lang in ["fr"]:
130
+ model_name = f"stt_{lang}_conformer_ctc_large"
131
+ elif lang in ["zh"]:
132
+ model_name = f"stt_{lang}_citrinet_1024_gamma_0_25"
133
+
134
+ # decide which of Mic / File_Upload is used as input & do error handling
135
+ if (Microphone is not None) and (File_Upload is not None):
136
+ raise gr.Error("Please use either the microphone or file upload input - not both")
137
+
138
+ elif (Microphone is None) and (File_Upload is None):
139
+ raise gr.Error("You have to either use the microphone or upload an audio file")
140
+
141
+ elif Microphone is not None:
142
+ file = Microphone
143
+ else:
144
+ file = File_Upload
145
+
146
+ # check audio is not too long
147
+ audio_data, duration = get_audio_data_and_duration(file)
148
+
149
+ if duration > 4 * 60:
150
+ raise gr.Error(
151
+ f"Detected that uploaded audio has duration {duration/60:.1f} mins - please only upload audio of less than 4 mins duration"
152
+ )
153
+
154
+ # loading model
155
+ progress(0.1, desc="Loading speech recognition model")
156
+ model = ASRModel.from_pretrained(model_name)
157
+
158
+ if text: # check input text is not too long compared to audio
159
+ S_prime, T = get_S_prime_and_T(text, model_name, model, duration)
160
+
161
+ if S_prime > T:
162
+ raise gr.Error(
163
+ f"The number of tokens in the input text is too long compared to the duration of the audio."
164
+ f" This model can handle {T} tokens + token repetitions at most. You have provided {S_prime} tokens + token repetitions. "
165
+ f" (Adjacent tokens that are not in the model's vocabulary are also counted as a token repetition.)"
166
+ )
167
+
168
+ with tempfile.TemporaryDirectory() as tmpdir:
169
+ audio_path = os.path.join(tmpdir, f'{utt_id}.wav')
170
+ soundfile.write(audio_path, audio_data, SAMPLE_RATE)
171
+
172
+ # getting the text if it hasn't been provided
173
+ if not text:
174
+ progress(0.2, desc="Transcribing audio")
175
+ text = model.transcribe([audio_path])[0]
176
+ if 'hybrid' in model_name:
177
+ text = text[0]
178
+ print('transcribed text:', text)
179
+
180
+ if text == "":
181
+ raise gr.Error(
182
+ "ERROR: the ASR model did not detect any speech in the input audio. Please upload audio with speech."
183
+ )
184
+
185
+ output_info += (
186
+ "You did not enter any input text, so the ASR model's transcription will be used:\n"
187
+ "--------------------------\n"
188
+ f"{text}\n"
189
+ "--------------------------\n"
190
+ f"You could try pasting the transcription into the text input box, correcting any"
191
+ " transcription errors, and clicking 'Submit' again."
192
+ )
193
+
194
+ if lang == "zh" and " " not in text:
195
+ # use jieba to add spaces between zh characters
196
+ text = " ".join(jieba.cut(text))
197
+
198
+ data = {
199
+ "audio_filepath": audio_path,
200
+ "text": text,
201
+ }
202
+ manifest_path = os.path.join(tmpdir, f"{utt_id}_manifest.json")
203
+ with open(manifest_path, 'w') as fout:
204
+ fout.write(f"{json.dumps(data)}\n")
205
+
206
+ # run alignment
207
+ if "|" in text:
208
+ resegment_text_to_fill_space = False
209
+ else:
210
+ resegment_text_to_fill_space = True
211
+
212
+ alignment_config = AlignmentConfig(
213
+ pretrained_name=model_name,
214
+ manifest_filepath=manifest_path,
215
+ output_dir=f"{tmpdir}/nfa_output/",
216
+ audio_filepath_parts_in_utt_id=1,
217
+ batch_size=1,
218
+ use_local_attention=True,
219
+ additional_segment_grouping_separator="|",
220
+ # transcribe_device='cpu',
221
+ # viterbi_device='cpu',
222
+ save_output_file_formats=["ass"],
223
+ ass_file_config=ASSFileConfig(
224
+ fontsize=45,
225
+ resegment_text_to_fill_space=resegment_text_to_fill_space,
226
+ max_lines_per_segment=4,
227
+ text_already_spoken_rgb=hex_to_rgb_list(col1),
228
+ text_being_spoken_rgb=hex_to_rgb_list(col2),
229
+ text_not_yet_spoken_rgb=hex_to_rgb_list(col3),
230
+ ),
231
+ )
232
+
233
+ progress(0.5, desc="Aligning audio")
234
+
235
+ main(alignment_config)
236
+
237
+ progress(0.95, desc="Saving generated alignments")
238
+
239
+
240
+ if lang=="zh":
241
+ # make video file from the token-level ASS file
242
+ ass_file_for_video = f"{tmpdir}/nfa_output/ass/tokens/{utt_id}.ass"
243
+ else:
244
+ # make video file from the word-level ASS file
245
+ ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
246
+
247
+ ffmpeg_command = (
248
+ f"ffmpeg -y -i {audio_path} "
249
+ "-f lavfi -i color=c=white:s=1280x720:r=50 "
250
+ "-crf 1 -shortest -vcodec libx264 -pix_fmt yuv420p "
251
+ f"-vf 'ass={ass_file_for_video}' "
252
+ f"{output_video_filepath}"
253
+ )
254
+
255
+ os.system(ffmpeg_command)
256
+
257
+ return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath
258
+
259
+
260
+ def delete_non_tmp_video(video_path):
261
+ if video_path:
262
+ if os.path.exists(video_path):
263
+ os.remove(video_path)
264
+ return None
265
+
266
+
267
+ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
268
+ non_tmp_output_video_filepath = gr.State([])
269
+
270
+ with gr.Row():
271
+ with gr.Column():
272
+ gr.Markdown("# NeMo Forced Aligner")
273
+ gr.Markdown(
274
+ "Demo for [NeMo Forced Aligner](https://github.com/NVIDIA/NeMo/tree/main/tools/nemo_forced_aligner) (NFA). "
275
+ "Upload audio and (optionally) the text spoken in the audio to generate a video where each part of the text will be highlighted as it is spoken. ",
276
+ )
277
+
278
+ with gr.Row():
279
+
280
+ with gr.Column(scale=1):
281
+ gr.Markdown("## Input")
282
+ lang_drop = gr.Dropdown(choices=["de", "en", "es", "fr", "zh"], value="en", label="Audio language",)
283
+
284
+ mic_in = gr.Audio(source="microphone", type='filepath', label="Microphone input (max 4 mins)")
285
+ audio_file_in = gr.Audio(source="upload", type='filepath', label="File upload (max 4 mins)")
286
+ ref_text = gr.Textbox(
287
+ label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
288
+ "Leave this field blank to use an ASR model's transcription as the reference text instead."
289
+ )
290
+
291
+ gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
292
+ with gr.Row():
293
+ col1 = gr.ColorPicker(label="text already spoken", value="#fcba03")
294
+ col2 = gr.ColorPicker(label="text being spoken", value="#bf45bf")
295
+ col3 = gr.ColorPicker(label="text to be spoken", value="#3e1af0")
296
+
297
+ submit_button = gr.Button("Submit")
298
+
299
+ with gr.Column(scale=1):
300
+ gr.Markdown("## Output")
301
+ video_out = gr.Video(label="output video")
302
+ text_out = gr.Textbox(label="output info", visible=False)
303
+
304
+ submit_button.click(
305
+ fn=align,
306
+ inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,],
307
+ outputs=[video_out, text_out, non_tmp_output_video_filepath],
308
+ ).then(
309
+ fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
310
+ )
311
+
312
+ demo.queue()
313
+ demo.launch()
314
+
packages.txt ADDED
@@ -0,0 +1,3 @@
1
+ ffmpeg
2
+ libsndfile1
3
+ build-essential
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ Cython
2
+ torch
requirements.txt ADDED
@@ -0,0 +1 @@
1
+ nemo_toolkit[all]
utils/constants.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ BLANK_TOKEN = "<b>"
16
+
17
+ SPACE_TOKEN = "<space>"
18
+
19
+ V_NEGATIVE_NUM = -3.4e38 # this is just above the most negative number in torch.float32
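For context, this constant appears to act as a finite stand-in for minus infinity when working with log-probabilities (e.g. for disallowed transitions in the Viterbi decoding), while still being representable in torch.float32:

    import torch
    torch.finfo(torch.float32).min   # -3.4028234663852886e+38, i.e. just below -3.4e38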
utils/data_prep.py ADDED
@@ -0,0 +1,835 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+ from typing import List, Union
19
+
20
+ import soundfile as sf
21
+ import torch
22
+ from tqdm.auto import tqdm
23
+ from utils.constants import BLANK_TOKEN, SPACE_TOKEN, V_NEGATIVE_NUM
24
+
25
+ from nemo.utils import logging
26
+
27
+
28
+ def _get_utt_id(audio_filepath, audio_filepath_parts_in_utt_id):
29
+ fp_parts = Path(audio_filepath).parts[-audio_filepath_parts_in_utt_id:]
30
+ utt_id = Path("_".join(fp_parts)).stem
31
+ utt_id = utt_id.replace(" ", "-") # replace any spaces in the filepath with dashes
32
+ return utt_id
33
+
34
+
35
+ def get_batch_starts_ends(manifest_filepath, batch_size):
36
+ """
37
+ Get the start and end ids of the lines we will use for each 'batch'.
38
+ """
39
+
40
+ with open(manifest_filepath, 'r') as f:
41
+ num_lines_in_manifest = sum(1 for _ in f)
42
+
43
+ starts = [x for x in range(0, num_lines_in_manifest, batch_size)]
44
+ ends = [x - 1 for x in starts]
45
+ ends.pop(0)
46
+ ends.append(num_lines_in_manifest)
47
+
48
+ return starts, ends
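A worked example of the indices this produces (illustrative):

    # manifest with 10 lines, batch_size = 4:
    # starts = [0, 4, 8]
    # ends   = [3, 7, 10]
    # The final 'end' may point one past the last line index (9 here); this is harmless
    # because get_manifest_lines_batch below simply stops when the file runs out of lines.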
49
+
50
+
51
+ def is_entry_in_any_lines(manifest_filepath, entry):
52
+ """
53
+ Returns True if entry is a key in any of the JSON lines in manifest_filepath
54
+ """
55
+
56
+ entry_in_manifest = False
57
+
58
+ with open(manifest_filepath, 'r') as f:
59
+ for line in f:
60
+ data = json.loads(line)
61
+
62
+ if entry in data:
63
+ entry_in_manifest = True
64
+
65
+ return entry_in_manifest
66
+
67
+
68
+ def is_entry_in_all_lines(manifest_filepath, entry):
69
+ """
70
+ Returns True if entry is a key in all of the JSON lines in manifest_filepath.
71
+ """
72
+ with open(manifest_filepath, 'r') as f:
73
+ for line in f:
74
+ data = json.loads(line)
75
+
76
+ if entry not in data:
77
+ return False
78
+
79
+ return True
80
+
81
+
82
+ def get_manifest_lines_batch(manifest_filepath, start, end):
83
+ manifest_lines_batch = []
84
+ with open(manifest_filepath, "r", encoding="utf-8-sig") as f:
85
+ for line_i, line in enumerate(f):
86
+ if line_i >= start and line_i <= end:
87
+ data = json.loads(line)
88
+ if "text" in data:
89
+ # remove any BOM, any duplicated spaces, convert any
90
+ # newline chars to spaces
91
+ data["text"] = data["text"].replace("\ufeff", "")
92
+ data["text"] = " ".join(data["text"].split())
93
+ manifest_lines_batch.append(data)
94
+
95
+ if line_i == end:
96
+ break
97
+ return manifest_lines_batch
98
+
99
+
100
+ def get_char_tokens(text, model):
101
+ tokens = []
102
+ for character in text:
103
+ if character in model.decoder.vocabulary:
104
+ tokens.append(model.decoder.vocabulary.index(character))
105
+ else:
106
+ tokens.append(len(model.decoder.vocabulary)) # return unk token (same as blank token)
107
+
108
+ return tokens
109
+
110
+
111
+ def is_sub_or_superscript_pair(ref_text, text):
112
+ """returns True if ref_text is a subscript or superscript version of text"""
113
+ sub_or_superscript_to_num = {
114
+ "⁰": "0",
115
+ "¹": "1",
116
+ "²": "2",
117
+ "³": "3",
118
+ "⁴": "4",
119
+ "⁵": "5",
120
+ "⁶": "6",
121
+ "⁷": "7",
122
+ "⁸": "8",
123
+ "⁹": "9",
124
+ "₀": "0",
125
+ "₁": "1",
126
+ "₂": "2",
127
+ "₃": "3",
128
+ "₄": "4",
129
+ "₅": "5",
130
+ "₆": "6",
131
+ "₇": "7",
132
+ "₈": "8",
133
+ "₉": "9",
134
+ }
135
+
136
+ if text in sub_or_superscript_to_num:
137
+ if sub_or_superscript_to_num[text] == ref_text:
138
+ return True
139
+ return False
140
+
141
+
142
+ def restore_token_case(word, word_tokens):
143
+
144
+ # remove repeated "▁" and "_" from word as that is what the tokenizer will do
145
+ while "▁▁" in word:
146
+ word = word.replace("▁▁", "▁")
147
+
148
+ while "__" in word:
149
+ word = word.replace("__", "_")
150
+
151
+ word_tokens_cased = []
152
+ word_char_pointer = 0
153
+
154
+ for token in word_tokens:
155
+ token_cased = ""
156
+
157
+ for token_char in token:
158
+ if token_char == word[word_char_pointer]:
159
+ token_cased += token_char
160
+ word_char_pointer += 1
161
+
162
+ else:
163
+ if token_char.upper() == word[word_char_pointer] or is_sub_or_superscript_pair(
164
+ token_char, word[word_char_pointer]
165
+ ):
166
+ token_cased += token_char.upper()
167
+ word_char_pointer += 1
168
+ else:
169
+ if token_char == "▁" or token_char == "_":
170
+ if word[word_char_pointer] == "▁" or word[word_char_pointer] == "_":
171
+ token_cased += token_char
172
+ word_char_pointer += 1
173
+ elif word_char_pointer == 0:
174
+ token_cased += token_char
175
+
176
+ else:
177
+ raise RuntimeError(
178
+ f"Unexpected error - failed to recover capitalization of tokens for word {word}"
179
+ )
180
+
181
+ word_tokens_cased.append(token_cased)
182
+
183
+ return word_tokens_cased
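An illustrative call, assuming a lowercase tokenizer splits "Hello" into ["▁hel", "lo"]:

    restore_token_case("Hello", ["▁hel", "lo"])   # -> ["▁Hel", "lo"]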
184
+
185
+
186
+ @dataclass
187
+ class Token:
188
+ text: str = None
189
+ text_cased: str = None
190
+ s_start: int = None
191
+ s_end: int = None
192
+ t_start: float = None
193
+ t_end: float = None
194
+
195
+
196
+ @dataclass
197
+ class Word:
198
+ text: str = None
199
+ s_start: int = None
200
+ s_end: int = None
201
+ t_start: float = None
202
+ t_end: float = None
203
+ tokens: List[Token] = field(default_factory=list)
204
+
205
+
206
+ @dataclass
207
+ class Segment:
208
+ text: str = None
209
+ s_start: int = None
210
+ s_end: int = None
211
+ t_start: float = None
212
+ t_end: float = None
213
+ words_and_tokens: List[Union[Word, Token]] = field(default_factory=list)
214
+
215
+
216
+ @dataclass
217
+ class Utterance:
218
+ token_ids_with_blanks: List[int] = field(default_factory=list)
219
+ segments_and_tokens: List[Union[Segment, Token]] = field(default_factory=list)
220
+ text: str = None
221
+ pred_text: str = None
222
+ audio_filepath: str = None
223
+ utt_id: str = None
224
+ saved_output_files: dict = field(default_factory=dict)
225
+
226
+
227
+ def get_utt_obj(
228
+ text, model, separator, T, audio_filepath, utt_id,
229
+ ):
230
+ """
231
+ Function to create an Utterance object and add all necessary information to it except
232
+ for timings of the segments / words / tokens according to the alignment - that will
233
+ be done later in a different function, after the alignment is done.
234
+
235
+ The Utterance object has a list segments_and_tokens which contains Segment objects and
236
+ Token objects (for blank tokens in between segments).
237
+ Within the Segment objects, there is a list words_and_tokens which contains Word objects and
238
+ Token objects (for blank tokens in between words).
239
+ Within the Word objects, there is a list tokens which contains Token objects for
240
+ blank and non-blank tokens.
241
+ We will be building up these lists in this function. This data structure will then be useful for
242
+ generating the various output files that we wish to save.
243
+ """
244
+
245
+ if not separator: # if separator is not defined - treat the whole text as one segment
246
+ segments = [text]
247
+ else:
248
+ segments = text.split(separator)
249
+
250
+ # remove any spaces at start and end of segments
251
+ segments = [seg.strip() for seg in segments]
252
+ # remove any empty segments
253
+ segments = [seg for seg in segments if len(seg) > 0]
254
+
255
+ utt = Utterance(text=text, audio_filepath=audio_filepath, utt_id=utt_id,)
256
+
257
+ # build up lists: token_ids_with_blanks, segments_and_tokens.
258
+ # The code for these is different depending on whether we use char-based tokens or not
259
+ if hasattr(model, 'tokenizer'):
260
+ if hasattr(model, 'blank_id'):
261
+ BLANK_ID = model.blank_id
262
+ else:
263
+ BLANK_ID = len(model.tokenizer.vocab) # TODO: check
264
+
265
+ utt.token_ids_with_blanks = [BLANK_ID]
266
+
267
+ # check for text being 0 length
268
+ if len(text) == 0:
269
+ return utt
270
+
271
+ # check for # tokens + token repetitions being > T
272
+ all_tokens = model.tokenizer.text_to_ids(text)
273
+ n_token_repetitions = 0
274
+ for i_tok in range(1, len(all_tokens)):
275
+ if all_tokens[i_tok] == all_tokens[i_tok - 1]:
276
+ n_token_repetitions += 1
277
+
278
+ if len(all_tokens) + n_token_repetitions > T:
279
+ logging.info(
280
+ f"Utterance {utt_id} has too many tokens compared to the audio file duration."
281
+ " Will not generate output alignment files for this utterance."
282
+ )
283
+ return utt
284
+
285
+ # build up data structures containing segments/words/tokens
286
+ utt.segments_and_tokens.append(Token(text=BLANK_TOKEN, text_cased=BLANK_TOKEN, s_start=0, s_end=0,))
287
+
288
+ segment_s_pointer = 1 # first segment will start at s=1 because s=0 is a blank
289
+ word_s_pointer = 1 # first word will start at s=1 because s=0 is a blank
290
+
291
+ for segment in segments:
292
+ # add the segment to segment_info and increment the segment_s_pointer
293
+ segment_tokens = model.tokenizer.text_to_tokens(segment)
294
+ utt.segments_and_tokens.append(
295
+ Segment(
296
+ text=segment,
297
+ s_start=segment_s_pointer,
298
+ # segment_tokens do not contain blanks => need to multiply by 2
299
+ # s_end needs to be the index of the final token (including blanks) of the current segment:
300
+ # segment_s_pointer + len(segment_tokens) * 2 is the index of the first token of the next segment =>
301
+ # => need to subtract 2
302
+ s_end=segment_s_pointer + len(segment_tokens) * 2 - 2,
303
+ )
304
+ )
305
+ segment_s_pointer += (
306
+ len(segment_tokens) * 2
307
+ ) # multiply by 2 to account for blanks (which are not present in segment_tokens)
308
+
309
+ words = segment.split(" ") # we define words to be space-separated sub-strings
310
+ for word_i, word in enumerate(words):
311
+
312
+ word_tokens = model.tokenizer.text_to_tokens(word)
313
+ word_token_ids = model.tokenizer.text_to_ids(word)
314
+ word_tokens_cased = restore_token_case(word, word_tokens)
315
+
316
+ # add the word to word_info and increment the word_s_pointer
317
+ utt.segments_and_tokens[-1].words_and_tokens.append(
318
+ # word_tokens do not contain blanks => need to multiply by 2
319
+ # s_end needs to be the index of the final token (including blanks) of the current word:
320
+ # word_s_pointer + len(word_tokens) * 2 is the index of the first token of the next word =>
321
+ # => need to subtract 2
322
+ Word(text=word, s_start=word_s_pointer, s_end=word_s_pointer + len(word_tokens) * 2 - 2)
323
+ )
324
+ word_s_pointer += (
325
+ len(word_tokens) * 2
326
+ ) # multiply by 2 to account for blanks (which are not present in word_tokens)
327
+
328
+ for token_i, (token, token_id, token_cased) in enumerate(
329
+ zip(word_tokens, word_token_ids, word_tokens_cased)
330
+ ):
331
+ # add the text tokens and the blanks in between them
332
+ # to our token-based variables
333
+ utt.token_ids_with_blanks.extend([token_id, BLANK_ID])
334
+ # adding Token object for non-blank token
335
+ utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append(
336
+ Token(
337
+ text=token,
338
+ text_cased=token_cased,
339
+ # utt.token_ids_with_blanks has the form [...., <this non-blank token>, <blank>] =>
340
+ # => if do len(utt.token_ids_with_blanks) - 1 you get the index of the final <blank>
341
+ # => we want to do len(utt.token_ids_with_blanks) - 2 to get the index of <this non-blank token>
342
+ s_start=len(utt.token_ids_with_blanks) - 2,
343
+ # s_end is same as s_start since the token only occupies one element in the list
344
+ s_end=len(utt.token_ids_with_blanks) - 2,
345
+ )
346
+ )
347
+
348
+ # adding Token object for blank tokens in between the tokens of the word
349
+ # (ie do not add another blank if you have reached the end)
350
+ if token_i < len(word_tokens) - 1:
351
+ utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append(
352
+ Token(
353
+ text=BLANK_TOKEN,
354
+ text_cased=BLANK_TOKEN,
355
+ # utt.token_ids_with_blanks has the form [...., <this blank token>] =>
356
+ # => if do len(utt.token_ids_with_blanks) -1 you get the index of this <blank>
357
+ s_start=len(utt.token_ids_with_blanks) - 1,
358
+ # s_end is same as s_start since the token only occupies one element in the list
359
+ s_end=len(utt.token_ids_with_blanks) - 1,
360
+ )
361
+ )
362
+
363
+ # add a Token object for blanks in between words in this segment
364
+ # (but only *in between* - do not add the token if it is after the final word)
365
+ if word_i < len(words) - 1:
366
+ utt.segments_and_tokens[-1].words_and_tokens.append(
367
+ Token(
368
+ text=BLANK_TOKEN,
369
+ text_cased=BLANK_TOKEN,
370
+ # utt.token_ids_with_blanks has the form [...., <this blank token>] =>
371
+ # => if do len(utt.token_ids_with_blanks) -1 you get the index of this <blank>
372
+ s_start=len(utt.token_ids_with_blanks) - 1,
373
+ # s_end is same as s_start since the token only occupies one element in the list
374
+ s_end=len(utt.token_ids_with_blanks) - 1,
375
+ )
376
+ )
377
+
378
+ # add the blank token in between segments/after the final segment
379
+ utt.segments_and_tokens.append(
380
+ Token(
381
+ text=BLANK_TOKEN,
382
+ text_cased=BLANK_TOKEN,
383
+ # utt.token_ids_with_blanks has the form [...., <this blank token>] =>
384
+ # => if do len(utt.token_ids_with_blanks) -1 you get the index of this <blank>
385
+ s_start=len(utt.token_ids_with_blanks) - 1,
386
+ # s_end is same as s_start since the token only occupies one element in the list
387
+ s_end=len(utt.token_ids_with_blanks) - 1,
388
+ )
389
+ )
390
+
391
+ return utt
392
+
393
+ elif hasattr(model.decoder, "vocabulary"): # i.e. tokenization is simply character-based
394
+
395
+ BLANK_ID = len(model.decoder.vocabulary) # TODO: check this is correct
396
+ SPACE_ID = model.decoder.vocabulary.index(" ")
397
+
398
+ utt.token_ids_with_blanks = [BLANK_ID]
399
+
400
+ # check for text being 0 length
401
+ if len(text) == 0:
402
+ return utt
403
+
404
+ # check for # tokens + token repetitions being > T
405
+ all_tokens = get_char_tokens(text, model)
406
+ n_token_repetitions = 0
407
+ for i_tok in range(1, len(all_tokens)):
408
+ if all_tokens[i_tok] == all_tokens[i_tok - 1]:
409
+ n_token_repetitions += 1
410
+
411
+ if len(all_tokens) + n_token_repetitions > T:
412
+ logging.info(
413
+ f"Utterance {utt_id} has too many tokens compared to the audio file duration."
414
+ " Will not generate output alignment files for this utterance."
415
+ )
416
+ return utt
417
+
418
+ # build up data structures containing segments/words/tokens
419
+ utt.segments_and_tokens.append(Token(text=BLANK_TOKEN, text_cased=BLANK_TOKEN, s_start=0, s_end=0,))
420
+
421
+ segment_s_pointer = 1 # first segment will start at s=1 because s=0 is a blank
422
+ word_s_pointer = 1 # first word will start at s=1 because s=0 is a blank
423
+
424
+ for i_segment, segment in enumerate(segments):
425
+ # add the segment to segment_info and increment the segment_s_pointer
426
+ segment_tokens = get_char_tokens(segment, model)
427
+ utt.segments_and_tokens.append(
428
+ Segment(
429
+ text=segment,
430
+ s_start=segment_s_pointer,
431
+ # segment_tokens do not contain blanks => need to multiply by 2
432
+ # s_end needs to be the index of the final token (including blanks) of the current segment:
433
+ # segment_s_pointer + len(segment_tokens) * 2 is the index of the first token of the next segment =>
434
+ # => need to subtract 2
435
+ s_end=segment_s_pointer + len(segment_tokens) * 2 - 2,
436
+ )
437
+ )
438
+
439
+ # for correct calculation: multiply len(segment_tokens) by 2 to account for blanks (which are not present in segment_tokens)
440
+ # and + 2 to account for [<token for space in between segments>, <blank token after that space token>]
441
+ segment_s_pointer += len(segment_tokens) * 2 + 2
442
+
443
+ words = segment.split(" ") # we define words to be space-separated substrings
444
+ for i_word, word in enumerate(words):
445
+
446
+ # convert string to list of characters
447
+ word_tokens = list(word)
448
+ # convert list of characters to list of their ids in the vocabulary
449
+ word_token_ids = get_char_tokens(word, model)
450
+
451
+ # add the word to word_info and increment the word_s_pointer
452
+ utt.segments_and_tokens[-1].words_and_tokens.append(
453
+ # note for s_end:
454
+ # word_tokens do not contain blanks => need to multiply by 2
455
+ # s_end needs to be the index of the final token (including blanks) of the current word:
456
+ # word_s_pointer + len(word_tokens) * 2 is the index of the first token of the next word =>
457
+ # => need to subtract 2
458
+ Word(text=word, s_start=word_s_pointer, s_end=word_s_pointer + len(word_tokens) * 2 - 2)
459
+ )
460
+
461
+ # for correct calculation: multiply len(word_tokens) by 2 to account for blanks (which are not present in word_tokens)
462
+ # and + 2 to account for [<token for space in between words>, <blank token after that space token>]
463
+ word_s_pointer += len(word_tokens) * 2 + 2
464
+
465
+ for token_i, (token, token_id) in enumerate(zip(word_tokens, word_token_ids)):
466
+ # add the text tokens and the blanks in between them
467
+ # to our token-based variables
468
+ utt.token_ids_with_blanks.extend([token_id])
469
+ utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append(
470
+ Token(
471
+ text=token,
472
+ text_cased=token,
473
+ # utt.token_ids_with_blanks has the form [..., <this non-blank token>]
474
+ # => do len(utt.token_ids_with_blanks) - 1 to get the index of this non-blank token
475
+ s_start=len(utt.token_ids_with_blanks) - 1,
476
+ # s_end is same as s_start since the token only occupies one element in the list
477
+ s_end=len(utt.token_ids_with_blanks) - 1,
478
+ )
479
+ )
480
+
481
+ if token_i < len(word_tokens) - 1: # only add blank tokens that are in the middle of words
482
+ utt.token_ids_with_blanks.extend([BLANK_ID])
483
+ utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append(
484
+ Token(
485
+ text=BLANK_TOKEN,
486
+ text_cased=BLANK_TOKEN,
487
+ # utt.token_ids_with_blanks has the form [..., <this blank token>]
488
+ # => do len(utt.token_ids_with_blanks) - 1 to get the index of this blank token
489
+ s_start=len(utt.token_ids_with_blanks) - 1,
490
+ # s_end is same as s_start since the token only occupies one element in the list
491
+ s_end=len(utt.token_ids_with_blanks) - 1,
492
+ )
493
+ )
494
+
495
+ # add space token (and the blanks around it) unless this is the final word in a segment
496
+ if i_word < len(words) - 1:
497
+ utt.token_ids_with_blanks.extend([BLANK_ID, SPACE_ID, BLANK_ID])
498
+ utt.segments_and_tokens[-1].words_and_tokens.append(
499
+ Token(
500
+ text=BLANK_TOKEN,
501
+ text_cased=BLANK_TOKEN,
502
+ # utt.token_ids_with_blanks has the form
503
+ # [..., <final token of previous word>, <blank token>, <space token>, <blank token>]
504
+ # => do len(utt.token_ids_with_blanks) - 3 to get the index of the blank token before the space token
505
+ s_start=len(utt.token_ids_with_blanks) - 3,
506
+ # s_end is same as s_start since the token only occupies one element in the list
507
+ s_end=len(utt.token_ids_with_blanks) - 3,
508
+ )
509
+ )
510
+ utt.segments_and_tokens[-1].words_and_tokens.append(
511
+ Token(
512
+ text=SPACE_TOKEN,
513
+ text_cased=SPACE_TOKEN,
514
+ # utt.token_ids_with_blanks has the form
515
+ # [..., <final token of previous word>, <blank token>, <space token>, <blank token>]
516
+ # => do len(utt.token_ids_with_blanks) - 2 to get the index of the space token
517
+ s_start=len(utt.token_ids_with_blanks) - 2,
518
+ # s_end is same as s_start since the token only occupies one element in the list
519
+ s_end=len(utt.token_ids_with_blanks) - 2,
520
+ )
521
+ )
522
+ utt.segments_and_tokens[-1].words_and_tokens.append(
523
+ Token(
524
+ text=BLANK_TOKEN,
525
+ text_cased=BLANK_TOKEN,
526
+ # utt.token_ids_with_blanks has the form
527
+ # [..., <final token of previous word>, <blank token>, <space token>, <blank token>]
528
+ # => do len(utt.token_ids_with_blanks) - 1 to get the index of the blank token after the space token
529
+ s_start=len(utt.token_ids_with_blanks) - 1,
530
+ # s_end is same as s_start since the token only occupies one element in the list
531
+ s_end=len(utt.token_ids_with_blanks) - 1,
532
+ )
533
+ )
534
+
535
+ # add a blank to the segment, and add a space after if this is not the final segment
536
+ utt.token_ids_with_blanks.extend([BLANK_ID])
537
+ utt.segments_and_tokens.append(
538
+ Token(
539
+ text=BLANK_TOKEN,
540
+ text_cased=BLANK_TOKEN,
541
+ # utt.token_ids_with_blanks has the form [..., <this blank token>]
542
+ # => do len(utt.token_ids_with_blanks) - 1 to get the index of this blank token
543
+ s_start=len(utt.token_ids_with_blanks) - 1,
544
+ # s_end is same as s_start since the token only occupies one element in the list
545
+ s_end=len(utt.token_ids_with_blanks) - 1,
546
+ )
547
+ )
548
+
549
+ if i_segment < len(segments) - 1:
550
+ utt.token_ids_with_blanks.extend([SPACE_ID, BLANK_ID])
551
+ utt.segments_and_tokens.append(
552
+ Token(
553
+ text=SPACE_TOKEN,
554
+ text_cased=SPACE_TOKEN,
555
+ # utt.token_ids_with_blanks has the form
556
+ # [..., <space token>, <blank token>]
557
+ # => do len(utt.token_ids_with_blanks) - 2 to get the index of the space token
558
+ s_start=len(utt.token_ids_with_blanks) - 2,
559
+ # s_end is same as s_start since the token only occupies one element in the list
560
+ s_end=len(utt.token_ids_with_blanks) - 2,
561
+ )
562
+ )
563
+ utt.segments_and_tokens.append(
564
+ Token(
565
+ text=BLANK_TOKEN,
566
+ text_cased=BLANK_TOKEN,
567
+ # utt.token_ids_with_blanks has the form
568
+ # [..., <space token>, <blank token>]
569
+ # => do len(utt.token_ids_with_blanks) - 1 to get the index of the blank token
570
+ s_start=len(utt.token_ids_with_blanks) - 1,
571
+ # s_end is same as s_start since the token only occupies one element in the list
572
+ s_end=len(utt.token_ids_with_blanks) - 1,
573
+ )
574
+ )
575
+
576
+ return utt
577
+
578
+ else:
579
+ raise RuntimeError("Cannot get tokens of this model.")
580
+
581
+
582
+ def add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration):
583
+ """
584
+ Function to add t_start and t_end (representing time in seconds) to the Utterance object utt_obj.
585
+ Args:
586
+ utt_obj: Utterance object to which we will add t_start and t_end for its
587
+ constituent segments/words/tokens.
588
+ alignment_utt: a list of ints indicating which token the alignment passes through at each
589
+ timestep (will take the form [0, 0, 1, 1, ..., <num of tokens including blanks in utterance>]).
590
+ output_timestep_duration: a float indicating the duration of a single output timestep from
591
+ the ASR Model.
592
+
593
+ Returns:
594
+ utt_obj: updated Utterance object.
595
+ """
596
+
597
+ # General idea for the algorithm of how we add t_start and t_end
598
+ # the timestep where a token s starts is the location of the first appearance of s_start in alignment_utt
599
+ # the timestep where a token s ends is the location of the final appearance of s_end in alignment_utt
600
+ # We will make dictionaries num_to_first_alignment_appearance and
601
+ # num_to_last_appearance and use that to update all of
602
+ # the t_start and t_end values in utt_obj.
603
+ # We will put t_start = t_end = -1 for tokens that are skipped (should only be blanks)
604
+
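A small worked example of this mapping (illustrative values):

    # alignment_utt = [0, 0, 1, 2, 2, 3], output_timestep_duration = 0.04
    # s = 2 first appears at timestep 3 and last appears at timestep 4
    # => t_start = 3 * 0.04 = 0.12 s and t_end = (4 + 1) * 0.04 = 0.20 s
    # s values that never appear in alignment_utt (skipped blanks) get t_start = t_end = -1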
605
+ num_to_first_alignment_appearance = dict()
606
+ num_to_last_alignment_appearance = dict()
607
+
608
+ prev_s = -1 # use prev_s to keep track of when the s changes
609
+ for t, s in enumerate(alignment_utt):
610
+ if s > prev_s:
611
+ num_to_first_alignment_appearance[s] = t
612
+
613
+ if prev_s >= 0: # don't record prev_s = -1
614
+ num_to_last_alignment_appearance[prev_s] = t - 1
615
+ prev_s = s
616
+ # add last appearance of the final s
617
+ num_to_last_alignment_appearance[prev_s] = len(alignment_utt) - 1
618
+
619
+ # update all the t_start and t_end in utt_obj
620
+ for segment_or_token in utt_obj.segments_and_tokens:
621
+ if type(segment_or_token) is Segment:
622
+ segment = segment_or_token
623
+ segment.t_start = num_to_first_alignment_appearance[segment.s_start] * output_timestep_duration
624
+ segment.t_end = (num_to_last_alignment_appearance[segment.s_end] + 1) * output_timestep_duration
625
+
626
+ for word_or_token in segment.words_and_tokens:
627
+ if type(word_or_token) is Word:
628
+ word = word_or_token
629
+ word.t_start = num_to_first_alignment_appearance[word.s_start] * output_timestep_duration
630
+ word.t_end = (num_to_last_alignment_appearance[word.s_end] + 1) * output_timestep_duration
631
+
632
+ for token in word.tokens:
633
+ if token.s_start in num_to_first_alignment_appearance:
634
+ token.t_start = num_to_first_alignment_appearance[token.s_start] * output_timestep_duration
635
+ else:
636
+ token.t_start = -1
637
+
638
+ if token.s_end in num_to_last_alignment_appearance:
639
+ token.t_end = (
640
+ num_to_last_alignment_appearance[token.s_end] + 1
641
+ ) * output_timestep_duration
642
+ else:
643
+ token.t_end = -1
644
+ else:
645
+ token = word_or_token
646
+ if token.s_start in num_to_first_alignment_appearance:
647
+ token.t_start = num_to_first_alignment_appearance[token.s_start] * output_timestep_duration
648
+ else:
649
+ token.t_start = -1
650
+
651
+ if token.s_end in num_to_last_alignment_appearance:
652
+ token.t_end = (num_to_last_alignment_appearance[token.s_end] + 1) * output_timestep_duration
653
+ else:
654
+ token.t_end = -1
655
+
656
+ else:
657
+ token = segment_or_token
658
+ if token.s_start in num_to_first_alignment_appearance:
659
+ token.t_start = num_to_first_alignment_appearance[token.s_start] * output_timestep_duration
660
+ else:
661
+ token.t_start = -1
662
+
663
+ if token.s_end in num_to_last_alignment_appearance:
664
+ token.t_end = (num_to_last_alignment_appearance[token.s_end] + 1) * output_timestep_duration
665
+ else:
666
+ token.t_end = -1
667
+
668
+ return utt_obj
669
+
670
+
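As a reading aid, here is a minimal, self-contained sketch (not part of this commit) of the first/last-appearance bookkeeping described in the comments above; the alignment and timestep duration are made up.

# hypothetical alignment: state index visited at each ASR output frame
alignment_utt = [0, 0, 1, 2, 2, 2, 3]
output_timestep_duration = 0.04  # assumed seconds per output frame

first, last = {}, {}
prev_s = -1
for t, s in enumerate(alignment_utt):
    if s > prev_s:
        first[s] = t
        if prev_s >= 0:
            last[prev_s] = t - 1
        prev_s = s
last[prev_s] = len(alignment_utt) - 1

print(first)  # {0: 0, 1: 2, 2: 3, 3: 6}
print(last)   # {0: 1, 1: 2, 2: 5, 3: 6}
# e.g. state 2 spans frames 3..5, so t_start = 3 * 0.04 = 0.12 s
# and t_end = (5 + 1) * 0.04 = 0.24 s, matching the formulas in the function above.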
671
+ def get_batch_variables(
672
+ manifest_lines_batch,
673
+ model,
674
+ separator,
675
+ align_using_pred_text,
676
+ audio_filepath_parts_in_utt_id,
677
+ output_timestep_duration,
678
+ simulate_cache_aware_streaming=False,
679
+ use_buffered_chunked_streaming=False,
680
+ buffered_chunk_params={},
681
+ ):
682
+ """
683
+ Returns:
684
+ log_probs, y, T, U (y and U are s.t. every other token is a blank) - these are the tensors we will need
685
+ during Viterbi decoding.
686
+ utt_obj_batch: a list of Utterance objects for every utterance in the batch.
687
+ output_timestep_duration: a float indicating the duration of a single output timestep from
688
+ the ASR Model.
689
+ """
690
+
691
+ # get hypotheses by calling 'transcribe'
692
+ # we will use the output log_probs, the duration of the log_probs,
693
+ # and (optionally) the predicted ASR text from the hypotheses
694
+ audio_filepaths_batch = [line["audio_filepath"] for line in manifest_lines_batch]
695
+ B = len(audio_filepaths_batch)
696
+ log_probs_list_batch = []
697
+ T_list_batch = []
698
+ pred_text_batch = []
699
+
700
+ if not use_buffered_chunked_streaming:
701
+ if not simulate_cache_aware_streaming:
702
+ with torch.no_grad():
703
+ hypotheses = model.transcribe(audio_filepaths_batch, return_hypotheses=True, batch_size=B)
704
+ else:
705
+ with torch.no_grad():
706
+ hypotheses = model.transcribe_simulate_cache_aware_streaming(
707
+ audio_filepaths_batch, return_hypotheses=True, batch_size=B
708
+ )
709
+
710
+ # if hypotheses form a tuple (from Hybrid model), extract just "best" hypothesis
711
+ if type(hypotheses) == tuple and len(hypotheses) == 2:
712
+ hypotheses = hypotheses[0]
713
+
714
+ for hypothesis in hypotheses:
715
+ log_probs_list_batch.append(hypothesis.y_sequence)
716
+ T_list_batch.append(hypothesis.y_sequence.shape[0])
717
+ pred_text_batch.append(hypothesis.text)
718
+ else:
719
+ delay = buffered_chunk_params["delay"]
720
+ model_stride_in_secs = buffered_chunk_params["model_stride_in_secs"]
721
+ tokens_per_chunk = buffered_chunk_params["tokens_per_chunk"]
722
+ for l in tqdm(audio_filepaths_batch, desc="Sample:"):
723
+ model.reset()
724
+ model.read_audio_file(l, delay, model_stride_in_secs)
725
+ hyp, logits = model.transcribe(tokens_per_chunk, delay, keep_logits=True)
726
+ log_probs_list_batch.append(logits)
727
+ T_list_batch.append(logits.shape[0])
728
+ pred_text_batch.append(hyp)
729
+
730
+ # we loop over every line in the manifest that is in our current batch,
731
+ # and record the y (list of tokens, including blanks), U (list of lengths of y) and
732
+ # token_info_batch, word_info_batch, segment_info_batch
733
+ y_list_batch = []
734
+ U_list_batch = []
735
+ utt_obj_batch = []
736
+
737
+ for i_line, line in enumerate(manifest_lines_batch):
738
+ if align_using_pred_text:
739
+ gt_text_for_alignment = " ".join(pred_text_batch[i_line].split())
740
+ else:
741
+ gt_text_for_alignment = line["text"]
742
+ utt_obj = get_utt_obj(
743
+ gt_text_for_alignment,
744
+ model,
745
+ separator,
746
+ T_list_batch[i_line],
747
+ audio_filepaths_batch[i_line],
748
+ _get_utt_id(audio_filepaths_batch[i_line], audio_filepath_parts_in_utt_id),
749
+ )
750
+
751
+ # update utt_obj.pred_text or utt_obj.text
752
+ if align_using_pred_text:
753
+ utt_obj.pred_text = pred_text_batch[i_line]
754
+ if len(utt_obj.pred_text) == 0:
755
+ logging.info(
756
+ f"'pred_text' of utterance {utt_obj.utt_id} is empty - we will not generate"
757
+ " any output alignment files for this utterance"
758
+ )
759
+ if "text" in line:
760
+ utt_obj.text = line["text"] # keep the text as we will save it in the output manifest
761
+ else:
762
+ utt_obj.text = line["text"]
763
+ if len(utt_obj.text) == 0:
764
+ logging.info(
765
+ f"'text' of utterance {utt_obj.utt_id} is empty - we will not generate"
766
+ " any output alignment files for this utterance"
767
+ )
768
+
769
+ y_list_batch.append(utt_obj.token_ids_with_blanks)
770
+ U_list_batch.append(len(utt_obj.token_ids_with_blanks))
771
+ utt_obj_batch.append(utt_obj)
772
+
773
+ # turn log_probs, y, T, U into dense tensors for fast computation during Viterbi decoding
774
+ T_max = max(T_list_batch)
775
+ U_max = max(U_list_batch)
776
+ # V = the number of tokens in the vocabulary + 1 for the blank token.
777
+ if hasattr(model, 'tokenizer'):
778
+ V = len(model.tokenizer.vocab) + 1
779
+ else:
780
+ V = len(model.decoder.vocabulary) + 1
781
+ T_batch = torch.tensor(T_list_batch)
782
+ U_batch = torch.tensor(U_list_batch)
783
+
784
+ # make log_probs_batch tensor of shape (B x T_max x V)
785
+ log_probs_batch = V_NEGATIVE_NUM * torch.ones((B, T_max, V))
786
+ for b, log_probs_utt in enumerate(log_probs_list_batch):
787
+ t = log_probs_utt.shape[0]
788
+ log_probs_batch[b, :t, :] = log_probs_utt
789
+
790
+ # make y tensor of shape (B x U_max)
791
+ # populate it initially with all 'V' numbers so that the 'V's will remain in the areas that
792
+ # are 'padding'. This will be useful for when we make 'log_probs_reorderd' during Viterbi decoding
793
+ # in a different function.
794
+ y_batch = V * torch.ones((B, U_max), dtype=torch.int64)
795
+ for b, y_utt in enumerate(y_list_batch):
796
+ U_utt = U_batch[b]
797
+ y_batch[b, :U_utt] = torch.tensor(y_utt)
798
+
799
+ # calculate output_timestep_duration if it is None
800
+ if output_timestep_duration is None:
801
+ if not 'window_stride' in model.cfg.preprocessor:
802
+ raise ValueError(
803
+ "Don't have attribute 'window_stride' in 'model.cfg.preprocessor' => cannot calculate "
804
+ " model_downsample_factor => stopping process"
805
+ )
806
+
807
+ if not 'sample_rate' in model.cfg.preprocessor:
808
+ raise ValueError(
809
+ "Don't have attribute 'sample_rate' in 'model.cfg.preprocessor' => cannot calculate start "
810
+ " and end time of segments => stopping process"
811
+ )
812
+
813
+ with sf.SoundFile(audio_filepaths_batch[0]) as f:
814
+ audio_dur = f.frames / f.samplerate
815
+ n_input_frames = audio_dur / model.cfg.preprocessor.window_stride
816
+ model_downsample_factor = round(n_input_frames / int(T_batch[0]))
817
+
818
+ output_timestep_duration = (
819
+ model.preprocessor.featurizer.hop_length * model_downsample_factor / model.cfg.preprocessor.sample_rate
820
+ )
821
+
822
+ logging.info(
823
+ f"Calculated that the model downsample factor is {model_downsample_factor}"
824
+ f" and therefore the ASR model output timestep duration is {output_timestep_duration}"
825
+ " -- will use this for all batches"
826
+ )
827
+
828
+ return (
829
+ log_probs_batch,
830
+ y_batch,
831
+ T_batch,
832
+ U_batch,
833
+ utt_obj_batch,
834
+ output_timestep_duration,
835
+ )
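Before moving on to the next file, here is a minimal sketch (not from the diff) of the padding scheme described in the comments above; batch sizes, durations and token ids are invented, and V_NEGATIVE_NUM stands in for the constant imported from utils.constants.

import torch

V_NEGATIVE_NUM = -3.4e38                 # stand-in for utils.constants.V_NEGATIVE_NUM
B, T_max, V = 2, 4, 5                    # assumed batch size, max frames, vocab size incl. blank
T_list = [4, 2]                          # per-utterance number of output frames
y_list = [[4, 0, 4], [4, 1, 4, 2, 4]]    # token ids interleaved with blanks (blank id = 4 here)

U_list = [len(y) for y in y_list]
U_max = max(U_list)

# log-prob padding: frames beyond T get a very low log-probability everywhere
log_probs_batch = V_NEGATIVE_NUM * torch.ones((B, T_max, V))
for b, T in enumerate(T_list):
    log_probs_batch[b, :T, :] = torch.randn(T, V).log_softmax(dim=-1)

# y padding: positions beyond U are filled with V, which indexes the extra
# "impossible" column appended to log_probs inside viterbi_decoding
y_batch = V * torch.ones((B, U_max), dtype=torch.int64)
for b, y in enumerate(y_list):
    y_batch[b, : U_list[b]] = torch.tensor(y)

print(log_probs_batch.shape, y_batch)    # torch.Size([2, 4, 5]) and the padded id matrix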
utils/make_ass_files.py ADDED
@@ -0,0 +1,462 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ This file contains functions for making ASS-format subtitle files based on the generated alignment.
17
+ ASS files can be generated highlighting token-level alignments or word-level alignments.
18
+ In both cases, 'segment' boundaries will be used to determine which parts of the text will appear
19
+ at the same time.
20
+ For the token-level ASS files, the text will be highlighted token-by-token, with the timings determined
21
+ by the NFA alignments.
22
+ For the word-level ASS files, the text will be highlighted word-by-word, with the timings determined
23
+ by the NFA alignments.
24
+ """
25
+
26
+ import os
27
+
28
+ from utils.constants import BLANK_TOKEN, SPACE_TOKEN
29
+ from utils.data_prep import Segment, Token, Word
30
+
31
+ PLAYERRESX = 384
32
+ PLAYERRESY = 288
33
+ MARGINL = 10
34
+ MARGINR = 10
35
+ MARGINV = 20
36
+
37
+
38
+ def seconds_to_ass_format(seconds_float):
39
+ seconds_float = float(seconds_float)
40
+ mm, ss_decimals = divmod(seconds_float, 60)
41
+ hh, mm = divmod(mm, 60)
42
+
43
+ hh = str(round(hh))
44
+ if len(hh) == 1:
45
+ hh = '0' + hh
46
+
47
+ mm = str(round(mm))
48
+ if len(mm) == 1:
49
+ mm = '0' + mm
50
+
51
+ ss_decimals = f"{ss_decimals:.2f}"
52
+ if len(ss_decimals.split(".")[0]) == 1:
53
+ ss_decimals = "0" + ss_decimals
54
+
55
+ srt_format_time = f"{hh}:{mm}:{ss_decimals}"
56
+
57
+ return srt_format_time
58
+
59
+
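For reference, a short usage sketch (not part of the commit) of the helper above; it assumes the repository layout in this commit so the function is importable.

from utils.make_ass_files import seconds_to_ass_format

print(seconds_to_ass_format(0))       # 00:00:00.00
print(seconds_to_ass_format(3725.5))  # 01:02:05.50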
60
+ def rgb_list_to_hex_bgr(rgb_list):
61
+ r, g, b = rgb_list
62
+ return f"{b:x}{g:x}{r:x}"
63
+
64
+
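Similarly, a small sketch (not part of the commit) of how the colour helper above feeds the inline ASS override tags built later in this file; the RGB triple is an arbitrary example.

from utils.make_ass_files import rgb_list_to_hex_bgr

rgb = [49, 46, 61]                                  # arbitrary example RGB value
print(rgb_list_to_hex_bgr(rgb))                     # 3d2e31 (hex digits in BGR order)
print(r"{\c&H" + rgb_list_to_hex_bgr(rgb) + r"&}")  # {\c&H3d2e31&}, later reset with {\r}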
65
+ def make_ass_files(
66
+ utt_obj, output_dir_root, ass_file_config,
67
+ ):
68
+
69
+ # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen
70
+ # in the case of the ground truth text being empty or the number of tokens being too large relative to the audio duration
71
+ if not utt_obj.segments_and_tokens:
72
+ return utt_obj
73
+
74
+ if ass_file_config.resegment_text_to_fill_space:
75
+ utt_obj = resegment_utt_obj(utt_obj, ass_file_config)
76
+
77
+ utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config,)
78
+ utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config,)
79
+
80
+ return utt_obj
81
+
82
+
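For orientation, the ass_file_config fields this module reads can be sketched as follows (not part of the commit; the values are illustrative, not the project's defaults).

from types import SimpleNamespace

ass_file_config = SimpleNamespace(
    fontsize=20,
    vertical_alignment="center",          # "top" | "center" | "bottom"
    resegment_text_to_fill_space=False,
    max_lines_per_segment=2,
    text_already_spoken_rgb=[49, 46, 61],
    text_being_spoken_rgb=[57, 171, 9],
    text_not_yet_spoken_rgb=[194, 193, 199],
)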
83
+ def _get_word_n_chars(word):
84
+ n_chars = 0
85
+ for token in word.tokens:
86
+ if token.text != BLANK_TOKEN:
87
+ n_chars += len(token.text)
88
+ return n_chars
89
+
90
+
91
+ def _get_segment_n_chars(segment):
92
+ n_chars = 0
93
+ for word_or_token in segment.words_and_tokens:
94
+ if word_or_token.text == SPACE_TOKEN:
95
+ n_chars += 1
96
+ elif word_or_token.text != BLANK_TOKEN:
97
+ n_chars += len(word_or_token.text)
98
+ return n_chars
99
+
100
+
101
+ def resegment_utt_obj(utt_obj, ass_file_config):
102
+
103
+ # get list of just all words and tokens
104
+ all_words_and_tokens = []
105
+ for segment_or_token in utt_obj.segments_and_tokens:
106
+ if type(segment_or_token) is Segment:
107
+ all_words_and_tokens.extend(segment_or_token.words_and_tokens)
108
+ else:
109
+ all_words_and_tokens.append(segment_or_token)
110
+
111
+ # figure out how many chars will fit into one 'slide' and thus should be the max
112
+ # size of a segment
113
+ approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (
114
+ ass_file_config.fontsize * 0.6
115
+ ) # assume chars 0.6 as wide as they are tall
116
+ approx_lines_per_segment = (PLAYERRESY - MARGINV) / (
117
+ ass_file_config.fontsize * 1.15
118
+ ) # assume line spacing is 1.15
119
+ if approx_lines_per_segment > ass_file_config.max_lines_per_segment:
120
+ approx_lines_per_segment = ass_file_config.max_lines_per_segment
121
+
122
+ max_chars_per_segment = int(approx_chars_per_line * approx_lines_per_segment)
123
+
124
+ new_segments_and_tokens = []
125
+ all_words_and_tokens_pointer = 0
126
+ for word_or_token in all_words_and_tokens:
127
+ if type(word_or_token) is Token:
128
+ new_segments_and_tokens.append(word_or_token)
129
+ all_words_and_tokens_pointer += 1
130
+ else:
131
+ break
132
+
133
+ new_segments_and_tokens.append(Segment())
134
+
135
+ while all_words_and_tokens_pointer < len(all_words_and_tokens):
136
+ word_or_token = all_words_and_tokens[all_words_and_tokens_pointer]
137
+ if type(word_or_token) is Word:
138
+
139
+ # if this is going to be the first word in the segment, we definitely want
140
+ # to add it to the segment
141
+ if not new_segments_and_tokens[-1].words_and_tokens:
142
+ new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
143
+
144
+ else:
145
+ # if not the first word, check what the new length of the segment will be
146
+ # if short enough - add this word to this segment;
147
+ # if too long - add to a new segment
148
+ this_word_n_chars = _get_word_n_chars(word_or_token)
149
+ segment_so_far_n_chars = _get_segment_n_chars(new_segments_and_tokens[-1])
150
+ if this_word_n_chars + segment_so_far_n_chars < max_chars_per_segment:
151
+ new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
152
+ else:
153
+ new_segments_and_tokens.append(Segment())
154
+ new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
155
+
156
+ else: # i.e. word_or_token is a token
157
+ # currently this breaks the convention of tokens at the end/beginning
158
+ # of segments being listed as separate tokens in segment.words_and_tokens
159
+ # TODO: change code so we follow this convention
160
+ new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
161
+
162
+ all_words_and_tokens_pointer += 1
163
+
164
+ utt_obj.segments_and_tokens = new_segments_and_tokens
165
+
166
+ return utt_obj
167
+
168
+
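A worked example (not part of the commit) of the segment-size estimate above, assuming fontsize = 20 and max_lines_per_segment = 2.

PLAYERRESX, PLAYERRESY = 384, 288
MARGINL, MARGINR, MARGINV = 10, 10, 20
fontsize, max_lines_per_segment = 20, 2   # assumed config values

approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (fontsize * 0.6)       # ~30.3
approx_lines_per_segment = min((PLAYERRESY - MARGINV) / (fontsize * 1.15),
                               max_lines_per_segment)                             # capped at 2
print(int(approx_chars_per_line * approx_lines_per_segment))                      # 60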
169
+ def make_word_level_ass_file(
170
+ utt_obj, output_dir_root, ass_file_config,
171
+ ):
172
+
173
+ default_style_dict = {
174
+ "Name": "Default",
175
+ "Fontname": "Arial",
176
+ "Fontsize": str(ass_file_config.fontsize),
177
+ "PrimaryColour": "&Hffffff",
178
+ "SecondaryColour": "&Hffffff",
179
+ "OutlineColour": "&H0",
180
+ "BackColour": "&H0",
181
+ "Bold": "0",
182
+ "Italic": "0",
183
+ "Underline": "0",
184
+ "StrikeOut": "0",
185
+ "ScaleX": "100",
186
+ "ScaleY": "100",
187
+ "Spacing": "0",
188
+ "Angle": "0",
189
+ "BorderStyle": "1",
190
+ "Outline": "1",
191
+ "Shadow": "0",
192
+ "Alignment": None, # will specify below
193
+ "MarginL": str(MARGINL),
194
+ "MarginR": str(MARGINR),
195
+ "MarginV": str(MARGINV),
196
+ "Encoding": "0",
197
+ }
198
+
199
+ if ass_file_config.vertical_alignment == "top":
200
+ default_style_dict["Alignment"] = "8" # text will be 'center-justified' and in the top of the screen
201
+ elif ass_file_config.vertical_alignment == "center":
202
+ default_style_dict["Alignment"] = "5" # text will be 'center-justified' and in the middle of the screen
203
+ elif ass_file_config.vertical_alignment == "bottom":
204
+ default_style_dict["Alignment"] = "2" # text will be 'center-justified' and in the bottom of the screen
205
+ else:
206
+ raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment")
207
+
208
+ output_dir = os.path.join(output_dir_root, "ass", "words")
209
+ os.makedirs(output_dir, exist_ok=True)
210
+ output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")
211
+
212
+ already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
213
+ being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
214
+ not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"
215
+
216
+ with open(output_file, 'w') as f:
217
+ default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
218
+ default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())
219
+
220
+ f.write(
221
+ (
222
+ "[Script Info]\n"
223
+ "ScriptType: v4.00+\n"
224
+ f"PlayResX: {PLAYERRESX}\n"
225
+ f"PlayResY: {PLAYERRESY}\n"
226
+ "\n"
227
+ "[V4+ Styles]\n"
228
+ f"{default_style_top_line}\n"
229
+ f"{default_style_bottom_line}\n"
230
+ "\n"
231
+ "[Events]\n"
232
+ "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
233
+ )
234
+ )
235
+
236
+ # write first set of subtitles for text before speech starts to be spoken
237
+ words_in_first_segment = []
238
+ for segment_or_token in utt_obj.segments_and_tokens:
239
+ if type(segment_or_token) is Segment:
240
+ first_segment = segment_or_token
241
+
242
+ for word_or_token in first_segment.words_and_tokens:
243
+ if type(word_or_token) is Word:
244
+ words_in_first_segment.append(word_or_token)
245
+ break
246
+
247
+ text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
248
+ subtitle_text = (
249
+ f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,,"
250
+ + text_before_speech.rstrip()
251
+ )
252
+
253
+ f.write(subtitle_text + '\n')
254
+
255
+ for segment_or_token in utt_obj.segments_and_tokens:
256
+ if type(segment_or_token) is Segment:
257
+ segment = segment_or_token
258
+
259
+ words_in_segment = []
260
+ for word_or_token in segment.words_and_tokens:
261
+ if type(word_or_token) is Word:
262
+ words_in_segment.append(word_or_token)
263
+
264
+ for word_i, word in enumerate(words_in_segment):
265
+
266
+ text_before = " ".join([x.text for x in words_in_segment[:word_i]])
267
+ if text_before != "":
268
+ text_before += " "
269
+ text_before = already_spoken_color_code + text_before + r"{\r}"
270
+
271
+ if word_i < len(words_in_segment) - 1:
272
+ text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
273
+ else:
274
+ text_after = ""
275
+ text_after = not_yet_spoken_color_code + text_after + r"{\r}"
276
+
277
+ aligned_text = being_spoken_color_code + word.text + r"{\r}"
278
+ aligned_text_off = already_spoken_color_code + word.text + r"{\r}"
279
+
280
+ subtitle_text = (
281
+ f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,,"
282
+ + text_before
283
+ + aligned_text
284
+ + text_after.rstrip()
285
+ )
286
+ f.write(subtitle_text + '\n')
287
+
288
+ # add subtitles without word-highlighting for when words are not being spoken
289
+ if word_i < len(words_in_segment) - 1:
290
+ last_word_end = float(words_in_segment[word_i].t_end)
291
+ next_word_start = float(words_in_segment[word_i + 1].t_start)
292
+ if next_word_start - last_word_end > 0.001:
293
+ subtitle_text = (
294
+ f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},{seconds_to_ass_format(next_word_start)},Default,,0,0,0,,"
295
+ + text_before
296
+ + aligned_text_off
297
+ + text_after.rstrip()
298
+ )
299
+ f.write(subtitle_text + '\n')
300
+
301
+ utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file
302
+
303
+ return utt_obj
304
+
305
+
306
+ def make_token_level_ass_file(
307
+ utt_obj, output_dir_root, ass_file_config,
308
+ ):
309
+
310
+ default_style_dict = {
311
+ "Name": "Default",
312
+ "Fontname": "Arial",
313
+ "Fontsize": str(ass_file_config.fontsize),
314
+ "PrimaryColour": "&Hffffff",
315
+ "SecondaryColour": "&Hffffff",
316
+ "OutlineColour": "&H0",
317
+ "BackColour": "&H0",
318
+ "Bold": "0",
319
+ "Italic": "0",
320
+ "Underline": "0",
321
+ "StrikeOut": "0",
322
+ "ScaleX": "100",
323
+ "ScaleY": "100",
324
+ "Spacing": "0",
325
+ "Angle": "0",
326
+ "BorderStyle": "1",
327
+ "Outline": "1",
328
+ "Shadow": "0",
329
+ "Alignment": None, # will specify below
330
+ "MarginL": str(MARGINL),
331
+ "MarginR": str(MARGINR),
332
+ "MarginV": str(MARGINV),
333
+ "Encoding": "0",
334
+ }
335
+
336
+ if ass_file_config.vertical_alignment == "top":
337
+ default_style_dict["Alignment"] = "8" # text will be 'center-justified' and in the top of the screen
338
+ elif ass_file_config.vertical_alignment == "center":
339
+ default_style_dict["Alignment"] = "5" # text will be 'center-justified' and in the middle of the screen
340
+ elif ass_file_config.vertical_alignment == "bottom":
341
+ default_style_dict["Alignment"] = "2" # text will be 'center-justified' and in the bottom of the screen
342
+ else:
343
+ raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment")
344
+
345
+ output_dir = os.path.join(output_dir_root, "ass", "tokens")
346
+ os.makedirs(output_dir, exist_ok=True)
347
+ output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")
348
+
349
+ already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
350
+ being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
351
+ not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"
352
+
353
+ with open(output_file, 'w') as f:
354
+ default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
355
+ default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())
356
+
357
+ f.write(
358
+ (
359
+ "[Script Info]\n"
360
+ "ScriptType: v4.00+\n"
361
+ f"PlayResX: {PLAYERRESX}\n"
362
+ f"PlayResY: {PLAYERRESY}\n"
363
+ "ScaledBorderAndShadow: yes\n"
364
+ "\n"
365
+ "[V4+ Styles]\n"
366
+ f"{default_style_top_line}\n"
367
+ f"{default_style_bottom_line}\n"
368
+ "\n"
369
+ "[Events]\n"
370
+ "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
371
+ )
372
+ )
373
+
374
+ # write first set of subtitles for text before speech starts to be spoken
375
+ tokens_in_first_segment = []
376
+ for segment_or_token in utt_obj.segments_and_tokens:
377
+ if type(segment_or_token) is Segment:
378
+ for word_or_token in segment_or_token.words_and_tokens:
379
+ if type(word_or_token) is Token:
380
+ if word_or_token.text != BLANK_TOKEN:
381
+ tokens_in_first_segment.append(word_or_token)
382
+ else:
383
+ for token in word_or_token.tokens:
384
+ if token.text != BLANK_TOKEN:
385
+ tokens_in_first_segment.append(token)
386
+
387
+ break
388
+
389
+ for token in tokens_in_first_segment:
390
+ token.text_cased = token.text_cased.replace(
391
+ "▁", " "
392
+ ) # replace underscores used in subword tokens with spaces
393
+ token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space
394
+
395
+ text_before_speech = (
396
+ not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
397
+ )
398
+ subtitle_text = (
399
+ f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,,"
400
+ + text_before_speech.rstrip()
401
+ )
402
+
403
+ f.write(subtitle_text + '\n')
404
+
405
+ for segment_or_token in utt_obj.segments_and_tokens:
406
+ if type(segment_or_token) is Segment:
407
+ segment = segment_or_token
408
+
409
+ tokens_in_segment = [] # make list of (non-blank) tokens
410
+ for word_or_token in segment.words_and_tokens:
411
+ if type(word_or_token) is Token:
412
+ if word_or_token.text != BLANK_TOKEN:
413
+ tokens_in_segment.append(word_or_token)
414
+ else:
415
+ for token in word_or_token.tokens:
416
+ if token.text != BLANK_TOKEN:
417
+ tokens_in_segment.append(token)
418
+
419
+ for token in tokens_in_segment:
420
+ token.text_cased = token.text_cased.replace(
421
+ "▁", " "
422
+ ) # replace underscores used in subword tokens with spaces
423
+ token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space
424
+
425
+ for token_i, token in enumerate(tokens_in_segment):
426
+
427
+ text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
428
+ text_before = already_spoken_color_code + text_before + r"{\r}"
429
+
430
+ if token_i < len(tokens_in_segment) - 1:
431
+ text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
432
+ else:
433
+ text_after = ""
434
+ text_after = not_yet_spoken_color_code + text_after + r"{\r}"
435
+
436
+ aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
437
+ aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"
438
+
439
+ subtitle_text = (
440
+ f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"
441
+ + text_before
442
+ + aligned_text
443
+ + text_after.rstrip()
444
+ )
445
+ f.write(subtitle_text + '\n')
446
+
447
+ # add subtitles without word-highlighting for when words are not being spoken
448
+ if token_i < len(tokens_in_segment) - 1:
449
+ last_token_end = float(tokens_in_segment[token_i].t_end)
450
+ next_token_start = float(tokens_in_segment[token_i + 1].t_start)
451
+ if next_token_start - last_token_end > 0.001:
452
+ subtitle_text = (
453
+ f"Dialogue: 0,{seconds_to_ass_format(last_token_end)},{seconds_to_ass_format(next_token_start)},Default,,0,0,0,,"
454
+ + text_before
455
+ + aligned_text_off
456
+ + text_after.rstrip()
457
+ )
458
+ f.write(subtitle_text + '\n')
459
+
460
+ utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file
461
+
462
+ return utt_obj
utils/make_ctm_files.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import soundfile as sf
18
+ from utils.constants import BLANK_TOKEN, SPACE_TOKEN
19
+ from utils.data_prep import Segment, Word
20
+
21
+
22
+ def make_ctm_files(
23
+ utt_obj, output_dir_root, ctm_file_config,
24
+ ):
25
+ """
26
+ Function to save token-, word- and segment-level CTM files for the utterance in utt_obj.
27
+ """
28
+
29
+ # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen
30
+ # in the case of the ground truth text being empty or the number of tokens being too large relative to the audio duration
31
+ if not utt_obj.segments_and_tokens:
32
+ return utt_obj
33
+
34
+ # get audio file duration if we will need it later
35
+ if ctm_file_config.minimum_timestamp_duration > 0:
36
+ with sf.SoundFile(utt_obj.audio_filepath) as f:
37
+ audio_file_duration = f.frames / f.samplerate
38
+ else:
39
+ audio_file_duration = None
40
+
41
+ utt_obj = make_ctm("tokens", utt_obj, output_dir_root, audio_file_duration, ctm_file_config,)
42
+ utt_obj = make_ctm("words", utt_obj, output_dir_root, audio_file_duration, ctm_file_config,)
43
+ utt_obj = make_ctm("segments", utt_obj, output_dir_root, audio_file_duration, ctm_file_config,)
44
+
45
+ return utt_obj
46
+
47
+
48
+ def make_ctm(
49
+ alignment_level, utt_obj, output_dir_root, audio_file_duration, ctm_file_config,
50
+ ):
51
+ output_dir = os.path.join(output_dir_root, "ctm", alignment_level)
52
+ os.makedirs(output_dir, exist_ok=True)
53
+
54
+ boundary_info_utt = []
55
+ for segment_or_token in utt_obj.segments_and_tokens:
56
+ if type(segment_or_token) is Segment:
57
+ segment = segment_or_token
58
+ if alignment_level == "segments":
59
+ boundary_info_utt.append(segment)
60
+
61
+ for word_or_token in segment.words_and_tokens:
62
+ if type(word_or_token) is Word:
63
+ word = word_or_token
64
+ if alignment_level == "words":
65
+ boundary_info_utt.append(word)
66
+
67
+ for token in word.tokens:
68
+ if alignment_level == "tokens":
69
+ boundary_info_utt.append(token)
70
+
71
+ else:
72
+ token = word_or_token
73
+ if alignment_level == "tokens":
74
+ boundary_info_utt.append(token)
75
+
76
+ else:
77
+ token = segment_or_token
78
+ if alignment_level == "tokens":
79
+ boundary_info_utt.append(token)
80
+
81
+ with open(os.path.join(output_dir, f"{utt_obj.utt_id}.ctm"), "w") as f_ctm:
82
+ for boundary_info_ in boundary_info_utt: # loop over every token/word/segment
83
+
84
+ # skip if t_start = t_end = negative number because we used it as a marker to skip some blank tokens
85
+ if not (boundary_info_.t_start < 0 or boundary_info_.t_end < 0):
86
+ text = boundary_info_.text
87
+ start_time = boundary_info_.t_start
88
+ end_time = boundary_info_.t_end
89
+
90
+ if (
91
+ ctm_file_config.minimum_timestamp_duration > 0
92
+ and ctm_file_config.minimum_timestamp_duration > end_time - start_time
93
+ ):
94
+ # make the predicted duration of the token/word/segment longer, growing it outwards equal
95
+ # amounts from the predicted center of the token/word/segment
96
+ token_mid_point = (start_time + end_time) / 2
97
+ start_time = max(token_mid_point - ctm_file_config.minimum_timestamp_duration / 2, 0)
98
+ end_time = min(
99
+ token_mid_point + ctm_file_config.minimum_timestamp_duration / 2, audio_file_duration
100
+ )
101
+
102
+ if not (
103
+ text == BLANK_TOKEN and ctm_file_config.remove_blank_tokens
104
+ ): # don't save blanks if we don't want to
105
+ # replace any spaces with <space> so we don't introduce extra space characters to our CTM files
106
+ text = text.replace(" ", SPACE_TOKEN)
107
+
108
+ f_ctm.write(f"{utt_obj.utt_id} 1 {start_time:.2f} {end_time - start_time:.2f} {text}\n")
109
+
110
+ utt_obj.saved_output_files[f"{alignment_level}_level_ctm_filepath"] = os.path.join(
111
+ output_dir, f"{utt_obj.utt_id}.ctm"
112
+ )
113
+
114
+ return utt_obj
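Before the next file, a sketch (not from the diff) of the CTM line format written above; the utterance id and timings are hypothetical, and the channel field is always written as 1 by the code above.

utt_id = "utt_001"                          # hypothetical utterance id
t_start, t_end, text = 1.20, 1.76, "hello"  # hypothetical word timing
print(f"{utt_id} 1 {t_start:.2f} {t_end - t_start:.2f} {text}")
# utt_001 1 1.20 0.56 hello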
utils/make_output_manifest.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+
17
+
18
+ def write_manifest_out_line(
19
+ f_manifest_out, utt_obj,
20
+ ):
21
+
22
+ data = {"audio_filepath": utt_obj.audio_filepath}
23
+ if not utt_obj.text is None:
24
+ data["text"] = utt_obj.text
25
+
26
+ if not utt_obj.pred_text is None:
27
+ data["pred_text"] = utt_obj.pred_text
28
+
29
+ for key, val in utt_obj.saved_output_files.items():
30
+ data[key] = val
31
+
32
+ new_line = json.dumps(data)
33
+ f_manifest_out.write(f"{new_line}\n")
34
+
35
+ return None
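A sketch (not from the diff) of one output-manifest line as produced above; the file paths are hypothetical, and the CTM key mirrors the ones written by utils/make_ctm_files.py.

import json

data = {
    "audio_filepath": "/data/utt_001.wav",                        # hypothetical path
    "text": "hello world",
    "words_level_ctm_filepath": "/output/ctm/words/utt_001.ctm",  # hypothetical path
}
print(json.dumps(data))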
utils/viterbi_decoding.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from utils.constants import V_NEGATIVE_NUM
17
+
18
+
19
+ def viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device):
20
+ """
21
+ Do Viterbi decoding with an efficient algorithm (the only for-loop in the 'forward pass' is over the time dimension).
22
+ Args:
23
+ log_probs_batch: tensor of shape (B, T_max, V). The parts of log_probs_batch which are 'padding' are filled
24
+ with 'V_NEGATIVE_NUM' - a large negative number which represents a very low probability.
25
+ y_batch: tensor of shape (B, U_max) - contains token IDs including blanks in every other position. The parts of
26
+ y_batch which are padding are filled with the number 'V'. V = the number of tokens in the vocabulary + 1 for
27
+ the blank token.
28
+ T_batch: tensor of shape (B, 1) - contains the durations of the log_probs_batch (so we can ignore the
29
+ parts of log_probs_batch which are padding)
30
+ U_batch: tensor of shape (B, 1) - contains the lengths of y_batch (so we can ignore the parts of y_batch
31
+ which are padding).
32
+ viterbi_device: the torch device on which Viterbi decoding will be done.
33
+
34
+ Returns:
35
+ alignments_batch: list of lists containing locations for the tokens we align to at each timestep.
36
+ Looks like: [[0, 0, 1, 2, 2, 3, 3, ..., ], ..., [0, 1, 2, 2, 2, 3, 4, ....]].
37
+ Each list inside alignments_batch is of length T_batch[location of utt in batch].
38
+ """
39
+
40
+ B, T_max, _ = log_probs_batch.shape
41
+ U_max = y_batch.shape[1]
42
+
43
+ # transfer all tensors to viterbi_device
44
+ log_probs_batch = log_probs_batch.to(viterbi_device)
45
+ y_batch = y_batch.to(viterbi_device)
46
+ T_batch = T_batch.to(viterbi_device)
47
+ U_batch = U_batch.to(viterbi_device)
48
+
49
+ # make tensor that we will put at timesteps beyond the duration of the audio
50
+ padding_for_log_probs = V_NEGATIVE_NUM * torch.ones((B, T_max, 1), device=viterbi_device)
51
+ # make log_probs_padded tensor of shape (B, T_max, V +1 ) where all of
52
+ # log_probs_padded[:,:,-1] is the 'V_NEGATIVE_NUM'
53
+ log_probs_padded = torch.cat((log_probs_batch, padding_for_log_probs), dim=2)
54
+
55
+ # initialize v_prev - tensor of previous timestep's viterbi probabilies, of shape (B, U_max)
56
+ v_prev = V_NEGATIVE_NUM * torch.ones((B, U_max), device=viterbi_device)
57
+ v_prev[:, :2] = torch.gather(input=log_probs_padded[:, 0, :], dim=1, index=y_batch[:, :2])
58
+
59
+ # initialize backpointers_rel - which contains values like 0 to indicate the backpointer is to the same u index,
60
+ # 1 to indicate the backpointer pointing to the u-1 index and 2 to indicate the backpointer is pointing to the u-2 index
61
+ backpointers_rel = -99 * torch.ones((B, T_max, U_max), dtype=torch.int8, device=viterbi_device)
62
+
63
+ # Make a letter_repetition_mask the same shape as y_batch
64
+ # the letter_repetition_mask will have 'True' where the token (including blanks) is the same
65
+ # as the token two places before it in the ground truth (and 'False everywhere else).
66
+ # We will use letter_repetition_mask to determine whether the Viterbi algorithm needs to look two tokens back or
67
+ # three tokens back
68
+ y_shifted_left = torch.roll(y_batch, shifts=2, dims=1)
69
+ letter_repetition_mask = y_batch - y_shifted_left
70
+ letter_repetition_mask[:, :2] = 1 # make sure dont apply mask to first 2 tokens
71
+ letter_repetition_mask = letter_repetition_mask == 0
72
+
73
+ for t in range(1, T_max):
74
+
75
+ # e_current is a tensor of shape (B, U_max) of the log probs of every possible token at the current timestep
76
+ e_current = torch.gather(input=log_probs_padded[:, t, :], dim=1, index=y_batch)
77
+
78
+ # apply a mask to e_current to cope with the fact that we do not keep the whole v_matrix and continue
79
+ # calculating viterbi probabilities during some 'padding' timesteps
80
+ t_exceeded_T_batch = t >= T_batch
81
+
82
+ U_can_be_final = torch.logical_or(
83
+ torch.arange(0, U_max, device=viterbi_device).unsqueeze(0) == (U_batch.unsqueeze(1) - 0),
84
+ torch.arange(0, U_max, device=viterbi_device).unsqueeze(0) == (U_batch.unsqueeze(1) - 1),
85
+ )
86
+
87
+ mask = torch.logical_not(torch.logical_and(t_exceeded_T_batch.unsqueeze(1), U_can_be_final,)).long()
88
+
89
+ e_current = e_current * mask
90
+
91
+ # v_prev_shifted is a tensor of shape (B, U_max) of the viterbi probabilities 1 timestep back and 1 token position back
92
+ v_prev_shifted = torch.roll(v_prev, shifts=1, dims=1)
93
+ # by doing a roll shift of size 1, we have brought the viterbi probability in the final token position to the
94
+ # first token position - let's overcome this by 'zeroing out' the probabilities in the firest token position
95
+ v_prev_shifted[:, 0] = V_NEGATIVE_NUM
96
+
97
+ # v_prev_shifted2 is a tensor of shape (B, U_max) of the viterbi probabilities 1 timestep back and 2 token position back
98
+ v_prev_shifted2 = torch.roll(v_prev, shifts=2, dims=1)
99
+ v_prev_shifted2[:, :2] = V_NEGATIVE_NUM # zero out as we did for v_prev_shifted
100
+ # use our letter_repetition_mask to remove the connections between 2 blanks (so we don't skip over a letter)
101
+ # and to remove the connections between 2 consective letters (so we don't skip over a blank)
102
+ v_prev_shifted2.masked_fill_(letter_repetition_mask, V_NEGATIVE_NUM)
103
+
104
+ # we need this v_prev_dup tensor so we can calculated the viterbi probability of every possible
105
+ # token position simultaneously
106
+ v_prev_dup = torch.cat(
107
+ (v_prev.unsqueeze(2), v_prev_shifted.unsqueeze(2), v_prev_shifted2.unsqueeze(2),), dim=2,
108
+ )
109
+
110
+ # candidates_v_current are our candidate viterbi probabilities for every token position, from which
111
+ # we will pick the max and record the argmax
112
+ candidates_v_current = v_prev_dup + e_current.unsqueeze(2)
113
+ # we straight away save results in v_prev instead of v_current, so that the variable v_prev will be ready for the
114
+ # next iteration of the for-loop
115
+ v_prev, bp_relative = torch.max(candidates_v_current, dim=2)
116
+
117
+ backpointers_rel[:, t, :] = bp_relative
118
+
119
+ # trace backpointers
120
+ alignments_batch = []
121
+ for b in range(B):
122
+ T_b = int(T_batch[b])
123
+ U_b = int(U_batch[b])
124
+
125
+ if U_b == 1: # i.e. we put only a blank token in the reference text because the reference text is empty
126
+ current_u = 0 # set initial u to 0 and let the rest of the code block run as usual
127
+ else:
128
+ current_u = int(torch.argmax(v_prev[b, U_b - 2 : U_b])) + U_b - 2
129
+ alignment_b = [current_u]
130
+ for t in range(T_max - 1, 0, -1):
131
+ current_u = current_u - int(backpointers_rel[b, t, current_u])
132
+ alignment_b.insert(0, current_u)
133
+ alignment_b = alignment_b[:T_b]
134
+ alignments_batch.append(alignment_b)
135
+
136
+ return alignments_batch
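Finally, a tiny smoke test (not part of the commit) showing the tensor shapes viterbi_decoding expects; the vocabulary and probabilities are made up, with two real tokens plus a blank (id 2), so V = 3.

import torch
from utils.viterbi_decoding import viterbi_decoding

B, T_max, V = 1, 3, 3
log_probs_batch = torch.randn(B, T_max, V).log_softmax(dim=-1)  # random stand-in log-probs
y_batch = torch.tensor([[2, 0, 2]])        # <blank>, token 0, <blank>
T_batch = torch.tensor([T_max])
U_batch = torch.tensor([y_batch.shape[1]])

alignments = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, torch.device("cpu"))
print(alignments)  # e.g. [[0, 1, 2]] - one index into y_batch per output frame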