whisper-webui-translate

Running

App Files Files Community

whisper-webui-translate / src /utils.py

avans06

The translation model is now compatible with the

a14fe5a 25 days ago

raw

history blame contribute delete

13.2 kB

	import re

	import zlib
	from typing import Iterator, TextIO, Union
	import tqdm

	import urllib3
	import unicodedata


	def exact_div(x, y):
	assert x % y == 0
	return x // y


	def str2bool(string):
	str2val = {"True": True, "False": False}
	if string in str2val:
	return str2val[string]
	else:
	raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")


	def optional_int(string):
	return None if string == "None" else int(string)


	def optional_float(string):
	return None if string == "None" else float(string)


	def compression_ratio(text) -> float:
	return len(text) / len(zlib.compress(text.encode("utf-8")))


	def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
	assert seconds >= 0, "non-negative timestamp expected"
	milliseconds = round(seconds * 1000.0)

	hours = milliseconds // 3_600_000
	milliseconds -= hours * 3_600_000

	minutes = milliseconds // 60_000
	milliseconds -= minutes * 60_000

	seconds = milliseconds // 1_000
	milliseconds -= seconds * 1_000

	hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
	return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"


	def write_txt(transcript: Iterator[dict], file: TextIO):
	for segment in transcript:
	print(segment['text'].strip(), file=file, flush=True)


	def write_vtt(transcript: Iterator[dict], file: TextIO,
	maxLineWidth=None, highlight_words: bool = False):
	iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)

	print("WEBVTT\n", file=file)

	for segment in iterator:
	text = segment['text'].replace('-->', '->')

	print(
	f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
	f"{text}\n",
	file=file,
	flush=True,
	)

	def write_srt(transcript: Iterator[dict], file: TextIO,
	maxLineWidth=None, highlight_words: bool = False):
	"""
	Write a transcript to a file in SRT format.
	Example usage:
	from pathlib import Path
	from whisper.utils import write_srt
	result = transcribe(model, audio_path, temperature=temperature, **args)
	# save SRT
	audio_basename = Path(audio_path).stem
	with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
	write_srt(result["segments"], file=srt)
	"""
	iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)

	for i, segment in enumerate(iterator, start=1):
	text = segment['text'].replace('-->', '->')

	# write srt lines
	print(
	f"{i}\n"
	f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
	f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
	f"{text}\n",
	file=file,
	flush=True,
	)

	def write_srt_original(transcript: Iterator[dict], file: TextIO,
	maxLineWidth=None, highlight_words: bool = False, bilingual: bool = False):
	"""
	Write a transcript to a file in SRT format.
	Example usage:
	from pathlib import Path
	from whisper.utils import write_srt
	result = transcribe(model, audio_path, temperature=temperature, **args)
	# save SRT
	audio_basename = Path(audio_path).stem
	with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
	write_srt(result["segments"], file=srt)
	"""
	iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)

	for i, segment in enumerate(iterator, start=1):
	if "original" not in segment:
	continue

	original = segment['original'].replace('-->', '->')

	# write srt lines
	print(
	f"{i}\n"
	f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
	f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}",
	file=file,
	flush=True,
	)

	if original is not None: print(f"{original}",
	file=file,
	flush=True)

	text = segment['text'].replace('-->', '->')
	print(f"{text}\n" if bilingual else "",
	file=file,
	flush=True)

	def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
	for segment in transcript:
	words: list = segment.get('words', [])

	# Append longest speaker ID if available
	segment_longest_speaker = segment.get('longest_speaker', None)

	# Yield the segment as-is or processed
	if len(words) == 0 and (maxLineWidth is None or maxLineWidth < 0) and segment_longest_speaker is None:
	yield segment

	if segment_longest_speaker is not None:
	segment_longest_speaker = segment_longest_speaker.replace("SPEAKER", "S")

	subtitle_start = segment['start']
	subtitle_end = segment['end']
	text = segment['text'].strip()
	text_original = segment['original'].strip() if 'original' in segment else None

	if len(words) == 0:
	# Prepend the longest speaker ID if available
	if segment_longest_speaker is not None:
	text = f"({segment_longest_speaker}) {text}"

	result = {
	'start': subtitle_start,
	'end' : subtitle_end,
	'text' : process_text(text, maxLineWidth)
	}
	if text_original is not None and len(text_original) > 0:
	result.update({'original': process_text(text_original, maxLineWidth)})
	yield result

	# We are done
	continue

	if segment_longest_speaker is not None:
	# Add the beginning
	words.insert(0, {
	'start': subtitle_start,
	'end' : subtitle_start,
	'word' : f"({segment_longest_speaker})"
	})

	text_words = [text] if not highlight_words and text_original is not None and len(text_original) > 0 else [ this_word["word"] for this_word in words ]

	subtitle_text = __join_words(text_words, maxLineWidth)

	# Iterate over the words in the segment
	if highlight_words:
	text_words_original = [ this_word["word_original"] for this_word in words if "word_original" in this_word ] if text_original is not None and len(text_original) > 0 else None
	last = subtitle_start

	for idx, this_word in enumerate(words):
	start = this_word['start']
	end = this_word['end']

	if last != start:
	# Display the text up to this point
	result = {
	'start': last,
	'end' : start,
	'text' : subtitle_text
	}
	if text_original is not None and len(text_original) > 0:
	result.update({'original': process_text(text_original, maxLineWidth)})
	yield result

	# Display the text with the current word highlighted
	result = {
	'start': start,
	'end' : end,
	'text' : __join_words(
	[
	re.sub(r"^(\s)(.)$", r"\1<u>\2</u>", word) if subidx == idx else word
	for subidx, word in enumerate(text_words)
	]
	, maxLineWidth)
	}
	if text_words_original is not None and len(text_words_original) > 0:
	result.update({'original': __join_words(
	[
	re.sub(r"^(\s)(.)$", r"\1<u>\2</u>", word_original) if subidx == idx else word_original
	for subidx, word_original in enumerate(text_words_original)
	]
	, maxLineWidth)})
	yield result
	last = end

	if last != subtitle_end:
	# Display the last part of the text
	result = {
	'start': last,
	'end' : subtitle_end,
	'text' : subtitle_text
	}
	if text_original is not None and len(text_original) > 0:
	result.update({'original': process_text(text_original, maxLineWidth)})
	yield result

	# Just return the subtitle text
	else:
	result = {
	'start': subtitle_start,
	'end' : subtitle_end,
	'text' : subtitle_text
	}
	if text_original is not None and len(text_original) > 0:
	result.update({'original': process_text(text_original, maxLineWidth)})
	yield result

	def __join_words(words: Iterator[str], maxLineWidth: int = None):
	result = "".join(words)

	if maxLineWidth is None or maxLineWidth < 0:
	return result

	return process_text(result, maxLineWidth)

	def process_text(text: str, maxLineWidth=None):
	"""
	Use east_asian_width to automatically determine the Character Width of the string, replacing the textwrap.wrap function.

	# East_Asian_Width (ea)

	ea ; A ; Ambiguous
	ea ; F ; Fullwidth
	ea ; H ; Halfwidth
	ea ; N ; Neutral
	ea ; Na ; Narrow
	ea ; W ; Wide
	https://stackoverflow.com/a/31666966
	"""
	if (maxLineWidth is None or maxLineWidth < 0):
	return text

	lines = []
	currentLine = ""
	currentWidth = 0

	for word in text.split():
	wordWidth = 0
	wordStart = 0
	if currentLine:
	currentLine += " "
	wordWidth += 1
	# The HTML tags <u> and </u> are not displayed,
	# so they should not be counted in the word length
	wordWidth -= 7 if "<u>" in word else 0
	for wordIdx, char in enumerate(word):
	if unicodedata.east_asian_width(char) not in {'W', 'F'}:
	wordWidth += 1
	else:
	if currentWidth + wordWidth + 2 > maxLineWidth:
	lines.append(currentLine + word[wordStart:wordIdx])
	currentLine = ""
	currentWidth = 0
	wordStart = wordIdx
	wordWidth = 0
	wordWidth += 2

	if currentWidth + wordWidth > maxLineWidth:
	lines.append(currentLine)
	currentLine = word[wordStart:]
	currentWidth = wordWidth
	else:
	currentLine += word[wordStart:]
	currentWidth += wordWidth

	if currentLine:
	lines.append(currentLine)

	return '\n'.join(lines)

	def len_wide(text: str):
	"""
	Use east_asian_width to automatically determine the Character Width of the string, replacing the textwrap.wrap function.

	# East_Asian_Width (ea)

	ea ; A ; Ambiguous
	ea ; F ; Fullwidth
	ea ; H ; Halfwidth
	ea ; N ; Neutral
	ea ; Na ; Narrow
	ea ; W ; Wide
	https://stackoverflow.com/a/31666966
	"""
	width = 0
	for char in text:
	width += (1 if unicodedata.east_asian_width(char) not in {'W', 'F'} else 2)

	return width


	def slugify(value, allow_unicode=False, is_lower=False):
	"""
	Taken from https://github.com/django/django/blob/master/django/utils/text.py
	Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
	dashes to single dashes. Remove characters that aren't alphanumerics,
	underscores, or hyphens. Convert to lowercase. Also strip leading and
	trailing whitespace, dashes, and underscores.
	"""
	value = str(value)
	if allow_unicode:
	value = unicodedata.normalize('NFKC', value)
	else:
	value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
	if is_lower:
	value = value.lower()
	value = re.sub(r'[^\w\s-]', '', value.replace("/","_").replace("⧸","_"))
	return re.sub(r'[-\s]+', '-', value).strip('-_')

	def download_file(url: str, destination: str):
	with urllib3.request.urlopen(url) as source, open(destination, "wb") as output:
	with tqdm(
	total=int(source.info().get("Content-Length")),
	ncols=80,
	unit="iB",
	unit_scale=True,
	unit_divisor=1024,
	) as loop:
	while True:
	buffer = source.read(8192)
	if not buffer:
	break

	output.write(buffer)
	loop.update(len(buffer))