erastorgueva-nv committed on
Commit
abb41a8
1 Parent(s): 700a61a

get latest NFA which should ensure subtitles show until end of video

Browse files
Files changed (1) hide show
  1. utils/make_ass_files.py +68 -8
utils/make_ass_files.py CHANGED
@@ -23,7 +23,9 @@ For the word-level ASS files, the text will be highlighted word-by-word, with th
23
  by the NFA alignemtns.
24
  """
25
 
 
26
  import os
 
27
 
28
  from utils.constants import BLANK_TOKEN, SPACE_TOKEN
29
  from utils.data_prep import Segment, Token, Word
@@ -74,8 +76,13 @@ def make_ass_files(
74
  if ass_file_config.resegment_text_to_fill_space:
75
  utt_obj = resegment_utt_obj(utt_obj, ass_file_config)
76
 
77
- utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config,)
78
- utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config,)
 
 
 
 
 
79
 
80
  return utt_obj
81
 
@@ -166,9 +173,7 @@ def resegment_utt_obj(utt_obj, ass_file_config):
166
  return utt_obj
167
 
168
 
169
- def make_word_level_ass_file(
170
- utt_obj, output_dir_root, ass_file_config,
171
- ):
172
 
173
  default_style_dict = {
174
  "Name": "Default",
@@ -298,14 +303,33 @@ def make_word_level_ass_file(
298
  )
299
  f.write(subtitle_text + '\n')
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file
302
 
303
  return utt_obj
304
 
305
 
306
- def make_token_level_ass_file(
307
- utt_obj, output_dir_root, ass_file_config,
308
- ):
309
 
310
  default_style_dict = {
311
  "Name": "Default",
@@ -457,6 +481,42 @@ def make_token_level_ass_file(
457
  )
458
  f.write(subtitle_text + '\n')
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file
461
 
462
  return utt_obj
 
23
  by the NFA alignemtns.
24
  """
25
 
26
+ import math
27
  import os
28
+ import soundfile as sf
29
 
30
  from utils.constants import BLANK_TOKEN, SPACE_TOKEN
31
  from utils.data_prep import Segment, Token, Word
 
76
  if ass_file_config.resegment_text_to_fill_space:
77
  utt_obj = resegment_utt_obj(utt_obj, ass_file_config)
78
 
79
+ # get duration of the utterance, so we know the final timestamp of the final set of subtitles,
80
+ # which we will keep showing until the end
81
+ with sf.SoundFile(utt_obj.audio_filepath) as f:
82
+ audio_dur = f.frames / f.samplerate
83
+
84
+ utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
85
+ utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
86
 
87
  return utt_obj
88
 
 
173
  return utt_obj
174
 
175
 
176
+ def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
 
 
177
 
178
  default_style_dict = {
179
  "Name": "Default",
 
303
  )
304
  f.write(subtitle_text + '\n')
305
 
306
+ # write final set of subtitles for text after speech has been spoken
307
+ words_in_final_segment = []
308
+ for segment_or_token in utt_obj.segments_and_tokens[::-1]:
309
+ if type(segment_or_token) is Segment:
310
+ final_segment = segment_or_token
311
+
312
+ for word_or_token in final_segment.words_and_tokens:
313
+ if type(word_or_token) is Word:
314
+ words_in_final_segment.append(word_or_token)
315
+ break
316
+
317
+ text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
318
+ # note: for now doing some extra padding with math.ceil(audio_dur)+1) to account for the fact that the video with subtitles can become
319
+ # longer than the original audio during the MP4 creation stage.
320
+ subtitle_text = (
321
+ f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
322
+ + text_after_speech.rstrip()
323
+ )
324
+
325
+ f.write(subtitle_text + '\n')
326
+
327
  utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file
328
 
329
  return utt_obj
330
 
331
 
332
+ def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
 
 
333
 
334
  default_style_dict = {
335
  "Name": "Default",
 
481
  )
482
  f.write(subtitle_text + '\n')
483
 
484
+ # Write final set of subtitles for text after speech has been spoken.
485
+ # To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
486
+ tokens_in_final_segment = []
487
+ for segment_or_token in utt_obj.segments_and_tokens[::-1]:
488
+ # Collect tokens from final segment - will 'break' so we only look at the final one.
489
+ if type(segment_or_token) is Segment:
490
+ # 'segment_or_token' is known to be Segment, which has attribute 'words_and_tokens'
491
+ for word_or_token in segment_or_token.words_and_tokens:
492
+ if type(word_or_token) is Token:
493
+ if word_or_token.text != BLANK_TOKEN:
494
+ tokens_in_final_segment.append(word_or_token)
495
+ else:
496
+ # 'word_or_token' is known to be a Word, which has attribute 'tokens'
497
+ for token in word_or_token.tokens:
498
+ if token.text != BLANK_TOKEN:
499
+ tokens_in_final_segment.append(token)
500
+ break
501
+
502
+ for token in tokens_in_final_segment:
503
+ token.text_cased = token.text_cased.replace(
504
+ "▁", " "
505
+ ) # replace underscores used in subword tokens with spaces
506
+ token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space
507
+
508
+ text_after_speech = (
509
+ already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
510
+ )
511
+ # note: for now doing some extra padding with math.ceil(audio_dur)+1) to account for the fact that the video with subtitles can become
512
+ # longer than the original audio during the MP4 creation stage.
513
+ subtitle_text = (
514
+ f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
515
+ + text_after_speech.rstrip()
516
+ )
517
+
518
+ f.write(subtitle_text + '\n')
519
+
520
  utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file
521
 
522
  return utt_obj