# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file contains functions for making ASS-format subtitle files based on the generated alignment.
ASS files can be generated highlighting token-level alignments or word-level alignments.
In both cases, 'segment' boundaries will be used to determine which parts of the text
will appear at the same time.
For the token-level ASS files, the text will be highlighted token-by-token, with the timings
determined by the NFA alignments.
For the word-level ASS files, the text will be highlighted word-by-word, with the timings
determined by the NFA alignments.
"""

import math
import os

import soundfile as sf

from utils.constants import BLANK_TOKEN, SPACE_TOKEN
from utils.data_prep import Segment, Token, Word

PLAYERRESX = 384
PLAYERRESY = 288
MARGINL = 10
MARGINR = 10
MARGINV = 20


def seconds_to_ass_format(seconds_float):
    seconds_float = float(seconds_float)
    mm, ss_decimals = divmod(seconds_float, 60)
    hh, mm = divmod(mm, 60)

    hh = str(round(hh))
    if len(hh) == 1:
        hh = '0' + hh

    mm = str(round(mm))
    if len(mm) == 1:
        mm = '0' + mm

    ss_decimals = f"{ss_decimals:.2f}"
    if len(ss_decimals.split(".")[0]) == 1:
        ss_decimals = "0" + ss_decimals

    ass_format_time = f"{hh}:{mm}:{ss_decimals}"

    return ass_format_time


def rgb_list_to_hex_bgr(rgb_list):
    r, g, b = rgb_list
    # zero-pad each channel so the colour code is always six hex digits,
    # in the BGR order that ASS colour tags expect
    return f"{b:02x}{g:02x}{r:02x}"
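
# Illustrative outputs for the two helpers above (input values are hypothetical,
# chosen only for this sketch):
#   seconds_to_ass_format(3661.5)      -> "01:01:01.50"
#   rgb_list_to_hex_bgr([49, 46, 61])  -> "3d2e31"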


def make_ass_files(
    utt_obj, output_dir_root, ass_file_config,
):

    # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen
    # in the case of the ground truth text being empty or the number of tokens being too large vs audio duration
    if not utt_obj.segments_and_tokens:
        return utt_obj

    if ass_file_config.resegment_text_to_fill_space:
        utt_obj = resegment_utt_obj(utt_obj, ass_file_config)

    # get duration of the utterance, so we know the final timestamp of the final set of subtitles,
    # which we will keep showing until the end
    with sf.SoundFile(utt_obj.audio_filepath) as f:
        audio_dur = f.frames / f.samplerate

    utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
    utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)

    return utt_obj


def _get_word_n_chars(word):
    n_chars = 0
    for token in word.tokens:
        if token.text != BLANK_TOKEN:
            n_chars += len(token.text)
    return n_chars


def _get_segment_n_chars(segment):
    n_chars = 0
    for word_or_token in segment.words_and_tokens:
        if word_or_token.text == SPACE_TOKEN:
            n_chars += 1
        elif word_or_token.text != BLANK_TOKEN:
            n_chars += len(word_or_token.text)
    return n_chars


def resegment_utt_obj(utt_obj, ass_file_config):

    # get list of just all words and tokens
    all_words_and_tokens = []
    for segment_or_token in utt_obj.segments_and_tokens:
        if type(segment_or_token) is Segment:
            all_words_and_tokens.extend(segment_or_token.words_and_tokens)
        else:
            all_words_and_tokens.append(segment_or_token)

    # figure out how many chars will fit into one 'slide' and thus should be the max
    # size of a segment (see the worked example after this function)
    approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (
        ass_file_config.fontsize * 0.6
    )  # assume chars are 0.6x as wide as they are tall
    approx_lines_per_segment = (PLAYERRESY - MARGINV) / (
        ass_file_config.fontsize * 1.15
    )  # assume line spacing is 1.15
    if approx_lines_per_segment > ass_file_config.max_lines_per_segment:
        approx_lines_per_segment = ass_file_config.max_lines_per_segment

    max_chars_per_segment = int(approx_chars_per_line * approx_lines_per_segment)

    new_segments_and_tokens = []

    # keep any leading tokens (before the first word) outside of the new segments
    all_words_and_tokens_pointer = 0
    for word_or_token in all_words_and_tokens:
        if type(word_or_token) is Token:
            new_segments_and_tokens.append(word_or_token)
            all_words_and_tokens_pointer += 1
        else:
            break

    new_segments_and_tokens.append(Segment())

    while all_words_and_tokens_pointer < len(all_words_and_tokens):
        word_or_token = all_words_and_tokens[all_words_and_tokens_pointer]
        if type(word_or_token) is Word:

            # if this is going to be the first word in the segment, we definitely want
            # to add it to the segment
            if not new_segments_and_tokens[-1].words_and_tokens:
                new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

            else:
                # if not the first word, check what the new length of the segment will be:
                # if short enough - add this word to this segment;
                # if too long - add to a new segment
                this_word_n_chars = _get_word_n_chars(word_or_token)
                segment_so_far_n_chars = _get_segment_n_chars(new_segments_and_tokens[-1])
                if this_word_n_chars + segment_so_far_n_chars < max_chars_per_segment:
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
                else:
                    new_segments_and_tokens.append(Segment())
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        else:  # i.e. word_or_token is a token
            # currently this breaks the convention of tokens at the end/beginning
            # of segments being listed as separate tokens in segment.word_and_tokens
            # TODO: change code so we follow this convention
            new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        all_words_and_tokens_pointer += 1

    utt_obj.segments_and_tokens = new_segments_and_tokens

    return utt_obj
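

# Worked example of the capacity estimate in resegment_utt_obj, using the module
# constants above and a hypothetical fontsize of 20:
#   approx_chars_per_line    = (384 - 10 - 10) / (20 * 0.6)  ~= 30.3
#   approx_lines_per_segment = (288 - 20) / (20 * 1.15)      ~= 11.7, capped at max_lines_per_segment
# so with max_lines_per_segment = 2, max_chars_per_segment = int(30.3 * 2) = 60.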


def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):

    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and at the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and at the bottom of the screen
    else:
        raise ValueError(
            f"got an unexpected value for ass_file_config.vertical_alignment: {ass_file_config.vertical_alignment}"
        )

    output_dir = os.path.join(output_dir_root, "ass", "words")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"

    with open(output_file, 'w') as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )

        # write first set of subtitles for text before speech starts to be spoken
        words_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                first_segment = segment_or_token

                for word_or_token in first_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_first_segment.append(word_or_token)
                break

        text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                words_in_segment = []
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_segment.append(word_or_token)

                for word_i, word in enumerate(words_in_segment):

                    text_before = " ".join([x.text for x in words_in_segment[:word_i]])
                    if text_before != "":
                        text_before += " "
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if word_i < len(words_in_segment) - 1:
                        text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + word.text + r"{\r}"
                    aligned_text_off = already_spoken_color_code + word.text + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )
                    f.write(subtitle_text + '\n')

                    # add subtitles without word-highlighting for when words are not being spoken
                    if word_i < len(words_in_segment) - 1:
                        last_word_end = float(words_in_segment[word_i].t_end)
                        next_word_start = float(words_in_segment[word_i + 1].t_start)
                        if next_word_start - last_word_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},{seconds_to_ass_format(next_word_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # write final set of subtitles for text after speech has been spoken
        words_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            if type(segment_or_token) is Segment:
                final_segment = segment_or_token

                for word_or_token in final_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_final_segment.append(word_or_token)
                break

        text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
        # note: for now doing some extra padding with math.ceil(audio_dur) + 1 to account for the fact
        # that the video with subtitles can become longer than the original audio during the MP4 creation stage
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur) + 1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files["words_level_ass_filepath"] = output_file

    return utt_obj
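

# For reference, each subtitle event written by the functions above and below uses the
# standard ASS 'Dialogue' line format. A hypothetical word-level event (times and colour
# codes are made up for illustration) looks like:
#   Dialogue: 0,00:00:01.20,00:00:01.55,Default,,0,0,0,,{\c&H3d2e31&}already {\r}{\c&H0000c8&}being{\r}{\c&H939393&} said{\r}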


def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):

    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and at the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and at the bottom of the screen
    else:
        raise ValueError(
            f"got an unexpected value for ass_file_config.vertical_alignment: {ass_file_config.vertical_alignment}"
        )

    output_dir = os.path.join(output_dir_root, "ass", "tokens")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"

    with open(output_file, 'w') as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "ScaledBorderAndShadow: yes\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )

        # write first set of subtitles for text before speech starts to be spoken
        tokens_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_first_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_first_segment.append(token)
                break

        for token in tokens_in_first_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace the '▁' marker used in subword tokens with a space
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

        text_before_speech = (
            not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
        )
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')
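
        # Note on text_cased (hypothetical subword pieces for illustration): tokens such as
        # ["▁hel", "lo", "▁world"] become [" hel", "lo", " world"] after the replacements
        # above, so "".join(...) yields " hello world".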

        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                # make list of (non-blank) tokens
                tokens_in_segment = []
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_segment.append(token)

                for token in tokens_in_segment:
                    token.text_cased = token.text_cased.replace(
                        "▁", " "
                    )  # replace the '▁' marker used in subword tokens with a space
                    token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

                for token_i, token in enumerate(tokens_in_segment):

                    text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if token_i < len(tokens_in_segment) - 1:
                        text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
                    aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )
                    f.write(subtitle_text + '\n')

                    # add subtitles without token-highlighting for when tokens are not being spoken
                    if token_i < len(tokens_in_segment) - 1:
                        last_token_end = float(tokens_in_segment[token_i].t_end)
                        next_token_start = float(tokens_in_segment[token_i + 1].t_start)
                        if next_token_start - last_token_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_token_end)},{seconds_to_ass_format(next_token_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')
        # Write final set of subtitles for text after speech has been spoken.
        # To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
        tokens_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            # Collect tokens from final segment - will 'break' so we only look at the final one.
            if type(segment_or_token) is Segment:
                # 'segment_or_token' is known to be a Segment, which has attribute 'words_and_tokens'
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_final_segment.append(word_or_token)
                    else:
                        # 'word_or_token' is known to be a Word, which has attribute 'tokens'
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_final_segment.append(token)
                break

        for token in tokens_in_final_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace the '▁' marker used in subword tokens with a space
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

        text_after_speech = (
            already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
        )
        # note: for now doing some extra padding with math.ceil(audio_dur) + 1 to account for the fact
        # that the video with subtitles can become longer than the original audio during the MP4 creation stage
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur) + 1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files["tokens_level_ass_filepath"] = output_file

    return utt_obj
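

if __name__ == "__main__":
    # Minimal smoke test for the helpers (a sketch only; the real entry point is the
    # NFA alignment script, which constructs `utt_obj` and `ass_file_config` itself).
    # All attribute values below are assumptions chosen for illustration.
    from types import SimpleNamespace

    demo_config = SimpleNamespace(
        fontsize=20,
        vertical_alignment="center",
        resegment_text_to_fill_space=False,
        max_lines_per_segment=2,
        text_already_spoken_rgb=[49, 46, 61],
        text_being_spoken_rgb=[200, 0, 0],
        text_not_yet_spoken_rgb=[147, 147, 147],
    )

    print(seconds_to_ass_format(3661.5))  # expected: 01:01:01.50
    print(rgb_list_to_hex_bgr(demo_config.text_being_spoken_rgb))  # expected: 0000c8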