Spaces:
Running
Running
File size: 3,938 Bytes
63ab978 6f0e822 63ab978 6f0e822 63ab978 6f0e822 63ab978 6f0e822 63ab978 6f0e822 40f2b57 63ab978 6f0e822 63ab978 6f0e822 63ab978 89f36c9 6f0e822 63ab978 6f0e822 63ab978 89f36c9 6f0e822 7e8138f 6f0e822 63ab978 6f0e822 63ab978 29aee3c 63ab978 3048545 92e709c 29aee3c 3048545 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import re
def timeformat_srt(time):
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        time: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, with a comma before the milliseconds.
    """
    # Convert to whole milliseconds first. Truncating the fractional part on
    # its own loses a millisecond to float error (1.001 -> ",000"), and
    # rounding only the millisecond field could never carry into the seconds.
    total_ms = max(0, int(round(time * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def timeformat_vtt(time):
    """Format an offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    Args:
        time: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, with a period before the milliseconds.
    """
    # Convert to whole milliseconds first. Truncating the fractional part on
    # its own loses a millisecond to float error (1.001 -> ".000"), and
    # rounding only the millisecond field could never carry into the seconds.
    total_ms = max(0, int(round(time * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def write_file(subtitle, output_file):
    """Write ``subtitle`` to ``output_file`` as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.

    Args:
        subtitle: The full text to write.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(subtitle)
def get_srt(segments):
    """Render transcription segments as SRT subtitle text.

    Args:
        segments: Iterable of dicts with ``'start'`` and ``'end'`` (seconds)
            and ``'text'`` keys. The dicts are not modified.

    Returns:
        The SRT document: for each segment a 1-based counter line, a
        ``start --> end`` line and the text, followed by a blank line.
    """
    blocks = []
    for number, segment in enumerate(segments, start=1):
        # Trim one leading space on a local copy so the caller's segment
        # dicts keep their original text.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        blocks.append(
            f"{number}\n"
            f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(blocks)
def get_vtt(segments):
    """Render transcription segments as WebVTT subtitle text.

    Args:
        segments: Iterable of dicts with ``'start'`` and ``'end'`` (seconds)
            and ``'text'`` keys. The dicts are not modified.

    Returns:
        The document: a header line, then for each segment a 1-based
        counter line, a ``start --> end`` line and the text, followed by a
        blank line.
    """
    # NOTE(review): the WebVTT specification spells the signature "WEBVTT".
    # The mixed-case header is kept because parse_vtt in this file matches on
    # "WebVTT"; change both together.
    blocks = ["WebVTT\n\n"]
    for number, segment in enumerate(segments, start=1):
        # Trim one leading space on a local copy so the caller's segment
        # dicts keep their original text.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        blocks.append(
            f"{number}\n"
            f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(blocks)
def get_txt(segments):
    """Render transcription segments as plain text, one segment per line.

    Args:
        segments: Iterable of dicts with a ``'text'`` key. The dicts are not
            modified.

    Returns:
        Each segment's text, minus one leading space if present, terminated
        by a newline. An empty string when there are no segments.
    """
    lines = []
    for segment in segments:
        # Trim on a local copy so the caller's segment dicts keep their text.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        lines.append(f"{text}\n")
    return "".join(lines)
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Args:
        file_path: Path to a UTF-8 encoded SRT file, with or without a
            byte-order mark.

    Returns:
        A list with one dict per cue, in file order, with string values
        under the keys ``"index"`` (first line of the cue), ``"timestamp"``
        (second line) and ``"sentence"`` (remaining lines joined by spaces).

    Raises:
        IndexError: If a non-empty block has fewer than two lines.
    """
    # 'utf-8-sig' drops a leading byte-order mark, which would otherwise end
    # up inside the first cue's index. Files without one decode identically.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    # Cues are separated by an empty line.
    for block in srt_data.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        lines = block.split('\n')
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })
    return data
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Blocks that begin with the WebVTT signature, in any letter case, are
    skipped. Every other non-empty block is treated as a cue whose first
    line is the identifier and whose second line is the timing line.

    Args:
        file_path: Path to a UTF-8 encoded WebVTT file, with or without a
            byte-order mark.

    Returns:
        A list with one dict per cue, in file order, with string values
        under the keys ``"index"`` (first line of the cue), ``"timestamp"``
        (second line) and ``"sentence"`` (remaining lines joined by spaces).

    Raises:
        IndexError: If a non-header block has fewer than two lines.
    """
    # 'utf-8-sig' drops a leading byte-order mark. Left in place it would
    # hide the header from the signature check below.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    # Blocks are separated by an empty line.
    for block in webvtt_data.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        # Match the signature case-insensitively so that both the standard
        # "WEBVTT" and the "WebVTT" spelling are recognised as the header.
        if block.upper().startswith("WEBVTT"):
            continue
        lines = block.split('\n')
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })
    return data
def get_serialized_srt(dicts):
    """Serialize parsed cue dicts into SRT text.

    Args:
        dicts: Iterable of dicts with ``"index"``, ``"timestamp"`` and
            ``"sentence"`` keys.

    Returns:
        The three fields of each cue on consecutive lines, each cue followed
        by a blank line. An empty string when there are no cues.
    """
    pieces = []
    for cue in dicts:
        pieces.extend((
            f'{cue["index"]}\n',
            f'{cue["timestamp"]}\n',
            f'{cue["sentence"]}\n\n',
        ))
    return "".join(pieces)
def get_serialized_vtt(dicts):
    """Serialize parsed cue dicts into WebVTT text.

    Args:
        dicts: Iterable of dicts with ``"index"``, ``"timestamp"`` and
            ``"sentence"`` keys.

    Returns:
        A header line and a blank line, then the three fields of each cue on
        consecutive lines, each cue followed by a blank line.
    """
    pieces = ["WebVTT\n\n"]
    for cue in dicts:
        pieces.extend((
            f'{cue["index"]}\n',
            f'{cue["timestamp"]}\n',
            f'{cue["sentence"]}\n\n',
        ))
    return "".join(pieces)
def safe_filename(name):
    """Replace characters that are invalid in file names with underscores.

    When ``_args.colab`` is truthy the result is also shortened to at most
    20 characters, keeping the text after the last ``.`` as the extension
    when it fits.

    Args:
        name: The proposed file name.

    Returns:
        The sanitized, and possibly shortened, file name.
    """
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import with app; confirm.
    from app import _args

    max_length = 20
    invalid_chars = r'[<>:"/\\|?*\x00-\x1f]'
    cleaned = re.sub(invalid_chars, '_', name)

    # Only shorten for colab runs, and only when the name is too long.
    if not _args.colab or len(cleaned) <= max_length:
        return cleaned

    # Text after the last dot; the whole name when there is no dot.
    extension = cleaned.rpartition('.')[2]
    suffix_length = len(extension) + 1
    if suffix_length < max_length:
        return cleaned[:max_length - suffix_length] + '.' + extension
    # The extension alone would fill the limit, so cut the name as a whole.
    return cleaned[:max_length]
|