Enhanced the translation capabilities of the Translation Model and optimized the Web UI.

1. Translation Model Enhancements:
* Added support for the M2M100 model.
* Added three new options for the translation model: Batch Size, No Repeat Ngram Size, Num Beams.
* When translating with the Translation Model, additional subtitle (srt) files are now generated for the original language (*-original.srt) and as a bilingual version (*-bilingual.srt).
* To match the expanded Translation Model functionality, nllbLangs has been renamed to translationLangs, and nllbModel has been renamed to translationModel.
2. Web UI Enhancements:
* Placed the translation model selection under tabs: M2M100, NLLB, and MT5.
* Organized the audio input under tabs: URL, Upload, and Microphone.
* Grouped the VAD options (VAD, Merge Window, Max Merge Size, Padding, Prompt Window, Initial Prompt Mode) into a collapsible accordion.
* Grouped the Word Timestamps options (Word Timestamps, Highlight Words, Prepend Punctuations, Append Punctuations) into a collapsible accordion.
* On the Full page, grouped the Whisper Advanced options into a collapsible section: Initial Prompt, Temperature, Best Of, Beam Size, Patience, Length Penalty, Suppress Tokens, Condition on previous text, FP16, Temperature increment on fallback, Compression ratio threshold, Logprob threshold, and No speech threshold.
3. New advanced options for Whisper and related program adjustments:
* In the Whisper Advanced options on the Full page, added Repetition Penalty and No Repeat Ngram Size options for use with faster-whisper.
* Merged the language definitions from languages.py into translationLangs.
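
The three new options are passed straight through to the TranslationModel that app.py now constructs, and the original/bilingual srt files are produced from the per-segment results it returns. A minimal sketch of how the pieces fit together, based on the calls visible in the diff below (the language code, option values, and the selectedModel config entry are illustrative only):

```python
# Illustrative sketch; selectedModel stands for a ModelConfig entry taken
# from app_config.models["nllb"], and the option values are examples.
from src.translation.translationModel import TranslationModel
from src.translation.translationLangs import get_lang_from_whisper_code, get_lang_from_nllb_name

whisperLang = get_lang_from_whisper_code("ja")            # source language from Whisper
translationLang = get_lang_from_nllb_name(nllbLangName)   # target language chosen in the UI

translationModel = TranslationModel(modelConfig=selectedModel,
                                    whisperLang=whisperLang,
                                    translationLang=translationLang,
                                    batchSize=2,            # Translation - Batch Size
                                    noRepeatNgramSize=3,    # Translation - No Repeat Ngram Size
                                    numBeams=2)             # Translation - Num Beams

translationModel.load_model()
for segment in result["segments"]:
    segment["original"] = segment["text"]   # kept for *-original.srt / *-bilingual.srt
    segment["text"] = translationModel.translation(segment["original"])
translationModel.release_vram()
```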
- app.py +485 -252
- cli.py +2 -2
- config.json5 +289 -243
- requirements-whisper.txt +0 -1
- src/config.py +28 -24
- src/languages.py +0 -147
- src/nllb/nllbLangs.py +0 -251
- src/translation/translationLangs.py +303 -0
- src/{nllb/nllbModel.py → translation/translationModel.py} +88 -72
- src/utils.py +79 -31
- src/vad.py +2 -3
- src/whisper/abstractWhisperContainer.py +3 -3
- src/whisper/fasterWhisperContainer.py +7 -17
- src/whisper/whisperContainer.py +7 -7
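
The bulk of the app.py changes rewires how the Web UI passes values to the transcriber: every Gradio component now carries an elem_id, the components are collected into a set, and the submit handler receives a component-to-value dict that it re-keys by elem_id before dispatching to transcribe_webui. A stripped-down sketch of that pattern (component names here are illustrative; in Gradio 3.x, passing a set as `inputs` makes the handler receive a single dict):

```python
import gradio as gr

def handler(data: dict):
    # Gradio delivers {component: value}; re-key by each component's elem_id,
    # mirroring transcribe_webui_simple_progress in the diff below.
    dataDict = {}
    for key, value in data.items():
        dataDict.update({key.elem_id: value})
    return str(dataDict)

with gr.Blocks() as demo:
    inputs = {
        gr.Text(label="URL (YouTube, etc.)", elem_id="urlData"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", elem_id="task"),
    }
    output = gr.Text(label="Result")
    submit = gr.Button("Submit", variant="primary")
    submit.click(fn=handler, inputs=inputs, outputs=[output])
```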
@@ -1,7 +1,7 @@
|
|
1 |
from datetime import datetime
|
2 |
import json
|
3 |
import math
|
4 |
-
from typing import Iterator, Union
|
5 |
import argparse
|
6 |
|
7 |
from io import StringIO
|
@@ -20,7 +20,6 @@ from src.diarization.diarizationContainer import DiarizationContainer
|
|
20 |
from src.hooks.progressListener import ProgressListener
|
21 |
from src.hooks.subTaskProgressListener import SubTaskProgressListener
|
22 |
from src.hooks.whisperProgressHook import create_progress_listener_handle
|
23 |
-
from src.languages import _TO_LANGUAGE_CODE, get_language_names, get_language_from_name, get_language_from_code
|
24 |
from src.modelCache import ModelCache
|
25 |
from src.prompts.jsonPromptStrategy import JsonPromptStrategy
|
26 |
from src.prompts.prependPromptStrategy import PrependPromptStrategy
|
@@ -34,18 +33,18 @@ import ffmpeg
|
|
34 |
import gradio as gr
|
35 |
|
36 |
from src.download import ExceededMaximumDuration, download_url
|
37 |
-
from src.utils import optional_int, slugify, str2bool, write_srt, write_vtt
|
38 |
from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
|
39 |
from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
|
40 |
from src.whisper.whisperFactory import create_whisper_container
|
41 |
-
from src.
|
42 |
-
from src.
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
import shutil
|
47 |
import zhconv
|
48 |
import tqdm
|
|
|
49 |
|
50 |
# Configure more application defaults in config.json5
|
51 |
|
@@ -114,120 +113,231 @@ class WhisperTranscriber:
|
|
114 |
self.diarization.cleanup()
|
115 |
self.diarization_kwargs = None
|
116 |
|
117 |
-
# Entry function for the simple tab
|
118 |
-
def transcribe_webui_simple(self,
|
119 |
-
vad, vadMergeWindow, vadMaxMergeSize,
|
120 |
-
word_timestamps: bool = False, highlight_words: bool = False,
|
121 |
-
diarization: bool = False, diarization_speakers: int = 2,
|
122 |
-
diarization_min_speakers = 1, diarization_max_speakers = 8):
|
123 |
-
return self.transcribe_webui_simple_progress(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
|
124 |
-
vad, vadMergeWindow, vadMaxMergeSize,
|
125 |
-
word_timestamps, highlight_words,
|
126 |
-
diarization, diarization_speakers,
|
127 |
-
diarization_min_speakers, diarization_max_speakers)
|
128 |
|
129 |
-
# Entry function for the simple tab progress
|
130 |
-
def transcribe_webui_simple_progress(self,
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
else:
|
143 |
-
self.
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
153 |
-
# Word timestamps
|
154 |
-
word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
|
155 |
-
initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
|
156 |
-
condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
|
157 |
-
compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
|
158 |
-
diarization: bool = False, diarization_speakers: int = 2,
|
159 |
-
diarization_min_speakers = 1, diarization_max_speakers = 8):
|
160 |
-
|
161 |
-
return self.transcribe_webui_full_progress(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
|
162 |
-
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
163 |
-
word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
|
164 |
-
initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
|
165 |
-
condition_on_previous_text, fp16, temperature_increment_on_fallback,
|
166 |
-
compression_ratio_threshold, logprob_threshold, no_speech_threshold,
|
167 |
-
diarization, diarization_speakers,
|
168 |
-
diarization_min_speakers, diarization_max_speakers)
|
169 |
-
|
170 |
-
# Entry function for the full tab with progress
|
171 |
-
def transcribe_webui_full_progress(self, modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
|
172 |
-
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
173 |
-
# Word timestamps
|
174 |
-
word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
|
175 |
-
initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
|
176 |
-
condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
|
177 |
-
compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
|
178 |
-
diarization: bool = False, diarization_speakers: int = 2,
|
179 |
-
diarization_min_speakers = 1, diarization_max_speakers = 8,
|
180 |
-
progress=gr.Progress()):
|
181 |
-
|
182 |
-
# Handle temperature_increment_on_fallback
|
183 |
-
if temperature_increment_on_fallback is not None:
|
184 |
-
temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
|
185 |
-
else:
|
186 |
-
temperature = [temperature]
|
187 |
-
|
188 |
-
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
|
189 |
|
190 |
-
# Set diarization
|
191 |
-
if diarization:
|
192 |
-
if diarization_speakers is not None and diarization_speakers < 1:
|
193 |
-
self.set_diarization(auth_token=self.app_config.auth_token, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
|
194 |
-
else:
|
195 |
-
self.set_diarization(auth_token=self.app_config.auth_token, num_speakers=diarization_speakers, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
|
196 |
-
else:
|
197 |
-
self.unset_diarization()
|
198 |
-
|
199 |
-
return self.transcribe_webui(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
200 |
-
initial_prompt=initial_prompt, temperature=temperature, best_of=best_of, beam_size=beam_size, patience=patience, length_penalty=length_penalty, suppress_tokens=suppress_tokens,
|
201 |
-
condition_on_previous_text=condition_on_previous_text, fp16=fp16,
|
202 |
-
compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold,
|
203 |
-
word_timestamps=word_timestamps, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, highlight_words=highlight_words,
|
204 |
-
progress=progress)
|
205 |
-
|
206 |
-
def transcribe_webui(self, modelName: str, languageName: str, nllbModelName: str, nllbLangName: str, urlData: str, multipleFiles, microphoneData: str, task: str,
|
207 |
-
vadOptions: VadOptions, progress: gr.Progress = None, highlight_words: bool = False,
|
208 |
-
**decodeOptions: dict):
|
209 |
-
try:
|
210 |
progress(0, desc="init audio sources")
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
if (len(sources) == 0):
|
213 |
raise Exception("init audio sources failed...")
|
|
|
214 |
try:
|
215 |
progress(0, desc="init whisper model")
|
216 |
-
|
217 |
-
|
218 |
-
selectedModel =
|
219 |
|
220 |
model = create_whisper_container(whisper_implementation=self.app_config.whisper_implementation,
|
221 |
model_name=selectedModel, compute_type=self.app_config.compute_type,
|
222 |
-
cache=self.model_cache, models=self.app_config.models)
|
223 |
|
224 |
progress(0, desc="init translate model")
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
progress(0, desc="init transcribe")
|
232 |
# Result
|
233 |
download = []
|
@@ -238,7 +348,7 @@ class WhisperTranscriber:
|
|
238 |
# Write result
|
239 |
downloadDirectory = tempfile.mkdtemp()
|
240 |
source_index = 0
|
241 |
-
extra_tasks_count = 1 if
|
242 |
|
243 |
outputDirectory = self.output_dir if self.output_dir is not None else downloadDirectory
|
244 |
|
@@ -267,10 +377,10 @@ class WhisperTranscriber:
|
|
267 |
sub_task_total=sub_task_total)
|
268 |
|
269 |
# Transcribe
|
270 |
-
result = self.transcribe_file(model, source.source_path,
|
271 |
-
if
|
272 |
-
|
273 |
-
|
274 |
|
275 |
short_name, suffix = source.get_short_name_suffix(max_length=self.app_config.input_max_file_name_length)
|
276 |
filePrefix = slugify(source_prefix + short_name, allow_unicode=True)
|
@@ -278,7 +388,7 @@ class WhisperTranscriber:
|
|
278 |
# Update progress
|
279 |
current_progress += source_audio_duration
|
280 |
|
281 |
-
source_download, source_text, source_vtt = self.write_result(result,
|
282 |
|
283 |
if self.app_config.merge_subtitle_with_sources and self.app_config.output_dir is not None:
|
284 |
print("\nmerge subtitle(srt) with source file [" + source.source_name + "]\n")
|
@@ -287,8 +397,8 @@ class WhisperTranscriber:
|
|
287 |
srt_path = source_download[0]
|
288 |
save_path = os.path.join(self.app_config.output_dir, filePrefix)
|
289 |
# save_without_ext, ext = os.path.splitext(save_path)
|
290 |
-
source_lang = "." +
|
291 |
-
translate_lang = "." +
|
292 |
output_with_srt = save_path + source_lang + translate_lang + suffix
|
293 |
|
294 |
#ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
|
@@ -363,12 +473,11 @@ class WhisperTranscriber:
|
|
363 |
except ExceededMaximumDuration as e:
|
364 |
return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
|
365 |
except Exception as e:
|
366 |
-
import traceback
|
367 |
print(traceback.format_exc())
|
368 |
-
return [], ("Error occurred during transcribe: " + str(e)),
|
369 |
|
370 |
|
371 |
-
def transcribe_file(self, model: AbstractWhisperContainer, audio_path: str,
|
372 |
vadOptions: VadOptions = VadOptions(),
|
373 |
progressListener: ProgressListener = None, **decodeOptions: dict):
|
374 |
|
@@ -398,7 +507,7 @@ class WhisperTranscriber:
|
|
398 |
raise ValueError("Invalid vadInitialPromptMode: " + initial_prompt_mode)
|
399 |
|
400 |
# Callable for processing an audio file
|
401 |
-
whisperCallable = model.create_callback(
|
402 |
|
403 |
# The results
|
404 |
if (vadOptions.vad == 'silero-vad'):
|
@@ -513,7 +622,7 @@ class WhisperTranscriber:
|
|
513 |
|
514 |
return config
|
515 |
|
516 |
-
def write_result(self, result: dict,
|
517 |
if not os.path.exists(output_dir):
|
518 |
os.makedirs(output_dir)
|
519 |
|
@@ -522,7 +631,7 @@ class WhisperTranscriber:
|
|
522 |
language = result["language"]
|
523 |
languageMaxLineWidth = self.__get_max_line_width(language)
|
524 |
|
525 |
-
if
|
526 |
try:
|
527 |
segments_progress_listener = SubTaskProgressListener(progressListener,
|
528 |
base_task_total=progressListener.sub_task_total,
|
@@ -530,17 +639,15 @@ class WhisperTranscriber:
|
|
530 |
sub_task_total=1)
|
531 |
pbar = tqdm.tqdm(total=len(segments))
|
532 |
perf_start_time = time.perf_counter()
|
533 |
-
|
534 |
for idx, segment in enumerate(segments):
|
535 |
seg_text = segment["text"]
|
536 |
-
|
537 |
-
|
538 |
-
if nllb_model.nllb_lang is not None:
|
539 |
-
segment["text"] = nllb_model.translation(seg_text)
|
540 |
pbar.update(1)
|
541 |
segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")
|
542 |
|
543 |
-
|
544 |
perf_end_time = time.perf_counter()
|
545 |
# Call the finished callback
|
546 |
if segments_progress_listener is not None:
|
@@ -549,24 +656,57 @@ class WhisperTranscriber:
|
|
549 |
print("\n\nprocess segments took {} seconds.\n\n".format(perf_end_time - perf_start_time))
|
550 |
except Exception as e:
|
551 |
# Ignore error - it's just a cleanup
|
|
|
552 |
print("Error process segments: " + str(e))
|
553 |
|
554 |
print("Max line width " + str(languageMaxLineWidth) + " for language:" + language)
|
555 |
vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
|
556 |
srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
|
557 |
json_result = json.dumps(result, indent=4, ensure_ascii=False)
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
564 |
|
565 |
output_files = []
|
566 |
output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
|
567 |
output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
|
568 |
output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"));
|
569 |
output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"));
|
|
|
|
|
|
|
|
|
570 |
|
571 |
return output_files, text, vtt
|
572 |
|
@@ -593,6 +733,10 @@ class WhisperTranscriber:
|
|
593 |
write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
|
594 |
elif format == 'srt':
|
595 |
write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
|
|
|
|
|
|
|
|
|
596 |
else:
|
597 |
raise Exception("Unknown format " + format)
|
598 |
|
@@ -621,6 +765,16 @@ class WhisperTranscriber:
|
|
621 |
self.diarization = None
|
622 |
|
623 |
def create_ui(app_config: ApplicationConfig):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
624 |
ui = WhisperTranscriber(app_config.input_audio_max_duration, app_config.vad_process_timeout, app_config.vad_cpu_cores,
|
625 |
app_config.delete_uploaded_files, app_config.output_dir, app_config)
|
626 |
|
@@ -639,59 +793,69 @@ def create_ui(app_config: ApplicationConfig):
|
|
639 |
# Try to convert from camel-case to title-case
|
640 |
implementation_name = app_config.whisper_implementation.title().replace("_", " ").replace("-", " ")
|
641 |
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
|
646 |
-
|
647 |
|
648 |
# Recommend faster-whisper
|
649 |
if is_whisper:
|
650 |
-
|
651 |
|
652 |
if app_config.input_audio_max_duration > 0:
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
whisper_models = app_config.get_model_names()
|
668 |
-
nllb_models = app_config.
|
|
|
|
|
669 |
|
670 |
-
common_whisper_inputs = lambda :
|
671 |
-
gr.Dropdown(label="Whisper Model (for audio)", choices=whisper_models, value=app_config.default_model_name),
|
672 |
-
gr.Dropdown(label="Whisper Language", choices=sorted(
|
673 |
-
|
674 |
-
|
675 |
-
gr.Dropdown(label="
|
676 |
-
gr.Dropdown(label="
|
677 |
-
|
678 |
-
|
679 |
-
gr.
|
680 |
-
gr.
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
|
687 |
-
gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
|
688 |
-
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
|
689 |
-
]
|
690 |
|
691 |
-
|
692 |
-
gr.
|
693 |
-
gr.
|
694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
695 |
|
696 |
has_diarization_libs = Diarization.has_libraries()
|
697 |
|
@@ -699,12 +863,12 @@ def create_ui(app_config: ApplicationConfig):
|
|
699 |
print("Diarization libraries not found - disabling diarization")
|
700 |
app_config.diarization = False
|
701 |
|
702 |
-
common_diarization_inputs = lambda :
|
703 |
-
gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs),
|
704 |
-
gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs),
|
705 |
-
gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs),
|
706 |
-
gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs)
|
707 |
-
|
708 |
|
709 |
common_output = lambda : [
|
710 |
gr.File(label="Download"),
|
@@ -714,84 +878,152 @@ def create_ui(app_config: ApplicationConfig):
|
|
714 |
|
715 |
is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
|
716 |
|
717 |
-
|
718 |
-
|
719 |
-
with gr.Blocks() as
|
720 |
-
gr.
|
|
|
|
|
721 |
with gr.Row():
|
722 |
with gr.Column():
|
723 |
-
|
724 |
with gr.Column():
|
725 |
with gr.Row():
|
726 |
-
|
727 |
-
with gr.
|
728 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
729 |
with gr.Column():
|
730 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
with gr.Column():
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
-
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
|
746 |
-
|
|
|
747 |
|
748 |
-
with gr.Blocks() as
|
749 |
-
gr.
|
|
|
|
|
750 |
with gr.Row():
|
751 |
with gr.Column():
|
752 |
-
|
753 |
with gr.Column():
|
754 |
with gr.Row():
|
755 |
-
|
756 |
-
with gr.
|
757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
758 |
with gr.Column():
|
759 |
-
|
760 |
-
|
761 |
-
gr.
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
gr.
|
769 |
-
gr.
|
770 |
-
|
771 |
-
|
772 |
-
|
773 |
-
|
774 |
-
|
775 |
-
gr.
|
776 |
-
|
777 |
-
|
778 |
-
|
779 |
-
|
780 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
781 |
with gr.Column():
|
782 |
-
|
783 |
-
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
|
792 |
-
|
|
|
|
|
793 |
|
794 |
-
demo = gr.TabbedInterface([
|
795 |
|
796 |
# Queue up the demo
|
797 |
if is_queue_mode:
|
@@ -807,8 +1039,7 @@ def create_ui(app_config: ApplicationConfig):
|
|
807 |
|
808 |
if __name__ == '__main__':
|
809 |
default_app_config = ApplicationConfig.create_default()
|
810 |
-
whisper_models = default_app_config.get_model_names()
|
811 |
-
nllb_models = default_app_config.get_nllb_model_names()
|
812 |
|
813 |
# Environment variable overrides
|
814 |
default_whisper_implementation = os.environ.get("WHISPER_IMPLEMENTATION", default_app_config.whisper_implementation)
|
@@ -846,9 +1077,10 @@ if __name__ == '__main__':
|
|
846 |
help="the compute type to use for inference")
|
847 |
parser.add_argument("--threads", type=optional_int, default=0,
|
848 |
help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
|
|
|
849 |
parser.add_argument("--vad_max_merge_size", type=int, default=default_app_config.vad_max_merge_size, \
|
850 |
help="The number of VAD - Max Merge Size (s).") # 30
|
851 |
-
parser.add_argument("--language", type=str, default=None, choices=sorted(
|
852 |
help="language spoken in the audio, specify None to perform language detection")
|
853 |
parser.add_argument("--save_downloaded_files", action='store_true', \
|
854 |
help="True to move downloaded files to outputs directory. This argument will take effect only after output_dir is set.")
|
@@ -858,6 +1090,7 @@ if __name__ == '__main__':
|
|
858 |
help="Maximum length of a file name.")
|
859 |
parser.add_argument("--autolaunch", action='store_true', \
|
860 |
help="open the webui URL in the system's default browser upon launch")
|
|
|
861 |
parser.add_argument('--auth_token', type=str, default=default_app_config.auth_token, help='HuggingFace API Token (optional)')
|
862 |
parser.add_argument("--diarization", type=str2bool, default=default_app_config.diarization, \
|
863 |
help="whether to perform speaker diarization")
|
|
|
1 |
from datetime import datetime
|
2 |
import json
|
3 |
import math
|
4 |
+
from typing import Iterator, Union, List
|
5 |
import argparse
|
6 |
|
7 |
from io import StringIO
|
|
|
20 |
from src.hooks.progressListener import ProgressListener
|
21 |
from src.hooks.subTaskProgressListener import SubTaskProgressListener
|
22 |
from src.hooks.whisperProgressHook import create_progress_listener_handle
|
|
|
23 |
from src.modelCache import ModelCache
|
24 |
from src.prompts.jsonPromptStrategy import JsonPromptStrategy
|
25 |
from src.prompts.prependPromptStrategy import PrependPromptStrategy
|
|
|
33 |
import gradio as gr
|
34 |
|
35 |
from src.download import ExceededMaximumDuration, download_url
|
36 |
+
from src.utils import optional_int, slugify, str2bool, write_srt, write_srt_original, write_vtt
|
37 |
from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
|
38 |
from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
|
39 |
from src.whisper.whisperFactory import create_whisper_container
|
40 |
+
from src.translation.translationModel import TranslationModel
|
41 |
+
from src.translation.translationLangs import (TranslationLang,
|
42 |
+
_TO_LANG_CODE_WHISPER, get_lang_whisper_names, get_lang_from_whisper_name, get_lang_from_whisper_code,
|
43 |
+
get_lang_nllb_names, get_lang_from_nllb_name, get_lang_m2m100_names, get_lang_from_m2m100_name)
|
|
|
44 |
import shutil
|
45 |
import zhconv
|
46 |
import tqdm
|
47 |
+
import traceback
|
48 |
|
49 |
# Configure more application defaults in config.json5
|
50 |
|
|
|
113 |
self.diarization.cleanup()
|
114 |
self.diarization_kwargs = None
|
115 |
|
116 |
+
# Entry function for the simple tab, Queue mode disabled: progress bars will not be shown
|
117 |
+
def transcribe_webui_simple(self, data: dict): return self.transcribe_webui_simple_progress(data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
+
# Entry function for the simple tab progress, Progress tracking requires queuing to be enabled
|
120 |
+
def transcribe_webui_simple_progress(self, data: dict, progress=gr.Progress()):
|
121 |
+
dataDict = {}
|
122 |
+
for key, value in data.items():
|
123 |
+
dataDict.update({key.elem_id: value})
|
124 |
+
|
125 |
+
return self.transcribe_webui(dataDict, progress=progress)
|
126 |
+
|
127 |
+
# Entry function for the full tab, Queue mode disabled: progress bars will not be shown
|
128 |
+
def transcribe_webui_full(self, data: dict): return self.transcribe_webui_full_progress(data)
|
129 |
+
|
130 |
+
# Entry function for the full tab with progress, Progress tracking requires queuing to be enabled
|
131 |
+
def transcribe_webui_full_progress(self, data: dict, progress=gr.Progress()):
|
132 |
+
dataDict = {}
|
133 |
+
for key, value in data.items():
|
134 |
+
dataDict.update({key.elem_id: value})
|
135 |
+
|
136 |
+
return self.transcribe_webui(dataDict, progress=progress)
|
137 |
+
|
138 |
+
def transcribe_webui(self, decodeOptions: dict, progress: gr.Progress = None):
|
139 |
+
"""
|
140 |
+
Transcribe an audio file using Whisper
|
141 |
+
https://github.com/openai/whisper/blob/main/whisper/transcribe.py#L37
|
142 |
+
Parameters
|
143 |
+
----------
|
144 |
+
model: Whisper
|
145 |
+
The Whisper model instance
|
146 |
+
|
147 |
+
temperature: Union[float, Tuple[float, ...]]
|
148 |
+
Temperature for sampling. It can be a tuple of temperatures, which will be successively used
|
149 |
+
upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
|
150 |
+
|
151 |
+
compression_ratio_threshold: float
|
152 |
+
If the gzip compression ratio is above this value, treat as failed
|
153 |
+
|
154 |
+
logprob_threshold: float
|
155 |
+
If the average log probability over sampled tokens is below this value, treat as failed
|
156 |
+
|
157 |
+
no_speech_threshold: float
|
158 |
+
If the no_speech probability is higher than this value AND the average log probability
|
159 |
+
over sampled tokens is below `logprob_threshold`, consider the segment as silent
|
160 |
+
|
161 |
+
condition_on_previous_text: bool
|
162 |
+
if True, the previous output of the model is provided as a prompt for the next window;
|
163 |
+
disabling may make the text inconsistent across windows, but the model becomes less prone to
|
164 |
+
getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
|
165 |
+
|
166 |
+
word_timestamps: bool
|
167 |
+
Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
|
168 |
+
and include the timestamps for each word in each segment.
|
169 |
+
|
170 |
+
prepend_punctuations: str
|
171 |
+
If word_timestamps is True, merge these punctuation symbols with the next word
|
172 |
+
|
173 |
+
append_punctuations: str
|
174 |
+
If word_timestamps is True, merge these punctuation symbols with the previous word
|
175 |
+
|
176 |
+
initial_prompt: Optional[str]
|
177 |
+
Optional text to provide as a prompt for the first window. This can be used to provide, or
|
178 |
+
"prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
|
179 |
+
to make it more likely to predict those word correctly.
|
180 |
+
|
181 |
+
decode_options: dict
|
182 |
+
Keyword arguments to construct `DecodingOptions` instances
|
183 |
+
https://github.com/openai/whisper/blob/main/whisper/decoding.py#L81
|
184 |
+
|
185 |
+
task: str = "transcribe"
|
186 |
+
whether to perform X->X "transcribe" or X->English "translate"
|
187 |
+
|
188 |
+
language: Optional[str] = None
|
189 |
+
language that the audio is in; uses detected language if None
|
190 |
+
|
191 |
+
temperature: float = 0.0
|
192 |
+
sample_len: Optional[int] = None # maximum number of tokens to sample
|
193 |
+
best_of: Optional[int] = None # number of independent sample trajectories, if t > 0
|
194 |
+
beam_size: Optional[int] = None # number of beams in beam search, if t == 0
|
195 |
+
patience: Optional[float] = None # patience in beam search (arxiv:2204.05424)
|
196 |
+
sampling-related options
|
197 |
+
|
198 |
+
length_penalty: Optional[float] = None
|
199 |
+
"alpha" in Google NMT, or None for length norm, when ranking generations
|
200 |
+
to select which to return among the beams or best-of-N samples
|
201 |
+
|
202 |
+
prompt: Optional[Union[str, List[int]]] = None # for the previous context
|
203 |
+
prefix: Optional[Union[str, List[int]]] = None # to prefix the current context
|
204 |
+
text or tokens to feed as the prompt or the prefix; for more info:
|
205 |
+
https://github.com/openai/whisper/discussions/117#discussioncomment-3727051
|
206 |
+
|
207 |
+
suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"
|
208 |
+
suppress_blank: bool = True # this will suppress blank outputs
|
209 |
+
list of tokens ids (or comma-separated token ids) to suppress
|
210 |
+
"-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
|
211 |
+
|
212 |
+
without_timestamps: bool = False # use <|notimestamps|> to sample text tokens only
|
213 |
+
max_initial_timestamp: Optional[float] = 1.0
|
214 |
+
timestamp sampling options
|
215 |
+
|
216 |
+
fp16: bool = True # use fp16 for most of the calculation
|
217 |
+
implementation details
|
218 |
+
repetition_penalty: float
|
219 |
+
The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.
|
220 |
+
no_repeat_ngram_size: int
|
221 |
+
The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.
|
222 |
+
"""
|
223 |
+
try:
|
224 |
+
whisperModelName: str = decodeOptions.pop("whisperModelName")
|
225 |
+
whisperLangName: str = decodeOptions.pop("whisperLangName")
|
226 |
+
|
227 |
+
translateInput: str = decodeOptions.pop("translateInput")
|
228 |
+
m2m100ModelName: str = decodeOptions.pop("m2m100ModelName")
|
229 |
+
m2m100LangName: str = decodeOptions.pop("m2m100LangName")
|
230 |
+
nllbModelName: str = decodeOptions.pop("nllbModelName")
|
231 |
+
nllbLangName: str = decodeOptions.pop("nllbLangName")
|
232 |
+
mt5ModelName: str = decodeOptions.pop("mt5ModelName")
|
233 |
+
mt5LangName: str = decodeOptions.pop("mt5LangName")
|
234 |
+
|
235 |
+
translationBatchSize: int = decodeOptions.pop("translationBatchSize")
|
236 |
+
translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
|
237 |
+
translationNumBeams: int = decodeOptions.pop("translationNumBeams")
|
238 |
+
|
239 |
+
sourceInput: str = decodeOptions.pop("sourceInput")
|
240 |
+
urlData: str = decodeOptions.pop("urlData")
|
241 |
+
multipleFiles: List = decodeOptions.pop("multipleFiles")
|
242 |
+
microphoneData: str = decodeOptions.pop("microphoneData")
|
243 |
+
task: str = decodeOptions.pop("task")
|
244 |
+
|
245 |
+
vad: str = decodeOptions.pop("vad")
|
246 |
+
vadMergeWindow: float = decodeOptions.pop("vadMergeWindow")
|
247 |
+
vadMaxMergeSize: float = decodeOptions.pop("vadMaxMergeSize")
|
248 |
+
vadPadding: float = decodeOptions.pop("vadPadding", self.app_config.vad_padding)
|
249 |
+
vadPromptWindow: float = decodeOptions.pop("vadPromptWindow", self.app_config.vad_prompt_window)
|
250 |
+
vadInitialPromptMode: str = decodeOptions.pop("vadInitialPromptMode", self.app_config.vad_initial_prompt_mode)
|
251 |
+
|
252 |
+
diarization: bool = decodeOptions.pop("diarization", False)
|
253 |
+
diarization_speakers: int = decodeOptions.pop("diarization_speakers", 2)
|
254 |
+
diarization_min_speakers: int = decodeOptions.pop("diarization_min_speakers", 1)
|
255 |
+
diarization_max_speakers: int = decodeOptions.pop("diarization_max_speakers", 8)
|
256 |
+
highlight_words: bool = decodeOptions.pop("highlight_words", False)
|
257 |
+
|
258 |
+
temperature: float = decodeOptions.pop("temperature", None)
|
259 |
+
temperature_increment_on_fallback: float = decodeOptions.pop("temperature_increment_on_fallback", None)
|
260 |
+
|
261 |
+
whisperRepetitionPenalty: float = decodeOptions.get("repetition_penalty", None)
|
262 |
+
whisperNoRepeatNgramSize: int = decodeOptions.get("no_repeat_ngram_size", None)
|
263 |
+
if whisperRepetitionPenalty is not None and whisperRepetitionPenalty <= 1.0:
|
264 |
+
decodeOptions.pop("repetition_penalty")
|
265 |
+
if whisperNoRepeatNgramSize is not None and whisperNoRepeatNgramSize <= 1:
|
266 |
+
decodeOptions.pop("no_repeat_ngram_size")
|
267 |
+
|
268 |
+
# word_timestamps = options.get("word_timestamps", False)
|
269 |
+
# condition_on_previous_text = options.get("condition_on_previous_text", False)
|
270 |
+
|
271 |
+
# prepend_punctuations = options.get("prepend_punctuations", None)
|
272 |
+
# append_punctuations = options.get("append_punctuations", None)
|
273 |
+
# initial_prompt = options.get("initial_prompt", None)
|
274 |
+
# best_of = options.get("best_of", None)
|
275 |
+
# beam_size = options.get("beam_size", None)
|
276 |
+
# patience = options.get("patience", None)
|
277 |
+
# length_penalty = options.get("length_penalty", None)
|
278 |
+
# suppress_tokens = options.get("suppress_tokens", None)
|
279 |
+
# compression_ratio_threshold = options.get("compression_ratio_threshold", None)
|
280 |
+
# logprob_threshold = options.get("logprob_threshold", None)
|
281 |
+
|
282 |
+
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
|
283 |
+
|
284 |
+
if diarization:
|
285 |
+
if diarization_speakers is not None and diarization_speakers < 1:
|
286 |
+
self.set_diarization(auth_token=self.app_config.auth_token, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
|
287 |
+
else:
|
288 |
+
self.set_diarization(auth_token=self.app_config.auth_token, num_speakers=diarization_speakers, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
|
289 |
else:
|
290 |
+
self.unset_diarization()
|
291 |
+
|
292 |
+
# Handle temperature_increment_on_fallback
|
293 |
+
if temperature is not None:
|
294 |
+
if temperature_increment_on_fallback is not None:
|
295 |
+
temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
|
296 |
+
else:
|
297 |
+
temperature = [temperature]
|
298 |
+
decodeOptions["temperature"] = temperature
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
300 |
progress(0, desc="init audio sources")
|
301 |
+
|
302 |
+
if sourceInput == "urlData":
|
303 |
+
sources = self.__get_source(urlData, None, None)
|
304 |
+
elif sourceInput == "multipleFiles":
|
305 |
+
sources = self.__get_source(None, multipleFiles, None)
|
306 |
+
elif sourceInput == "microphoneData":
|
307 |
+
sources = self.__get_source(None, None, microphoneData)
|
308 |
+
|
309 |
if (len(sources) == 0):
|
310 |
raise Exception("init audio sources failed...")
|
311 |
+
|
312 |
try:
|
313 |
progress(0, desc="init whisper model")
|
314 |
+
whisperLang: TranslationLang = get_lang_from_whisper_name(whisperLangName)
|
315 |
+
whisperLangCode = whisperLang.whisper.code if whisperLang is not None and whisperLang.whisper is not None else None
|
316 |
+
selectedModel = whisperModelName if whisperModelName is not None else "base"
|
317 |
|
318 |
model = create_whisper_container(whisper_implementation=self.app_config.whisper_implementation,
|
319 |
model_name=selectedModel, compute_type=self.app_config.compute_type,
|
320 |
+
cache=self.model_cache, models=self.app_config.models["whisper"])
|
321 |
|
322 |
progress(0, desc="init translate model")
|
323 |
+
translationLang = None
|
324 |
+
translationModel = None
|
325 |
+
if translateInput == "m2m100" and m2m100LangName is not None and len(m2m100LangName) > 0:
|
326 |
+
selectedModelName = m2m100ModelName if m2m100ModelName is not None and len(m2m100ModelName) > 0 else "m2m100_418M/facebook"
|
327 |
+
selectedModel = next((modelConfig for modelConfig in self.app_config.models["m2m100"] if modelConfig.name == selectedModelName), None)
|
328 |
+
translationLang = get_lang_from_m2m100_name(m2m100LangName)
|
329 |
+
elif translateInput == "nllb" and nllbLangName is not None and len(nllbLangName) > 0:
|
330 |
+
selectedModelName = nllbModelName if nllbModelName is not None and len(nllbModelName) > 0 else "nllb-200-distilled-600M/facebook"
|
331 |
+
selectedModel = next((modelConfig for modelConfig in self.app_config.models["nllb"] if modelConfig.name == selectedModelName), None)
|
332 |
+
translationLang = get_lang_from_nllb_name(nllbLangName)
|
333 |
+
elif translateInput == "mt5" and mt5LangName is not None and len(mt5LangName) > 0:
|
334 |
+
selectedModelName = mt5ModelName if mt5ModelName is not None and len(mt5ModelName) > 0 else "mt5-zh-ja-en-trimmed/K024"
|
335 |
+
selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
|
336 |
+
translationLang = get_lang_from_m2m100_name(mt5LangName)
|
337 |
+
|
338 |
+
if translationLang is not None:
|
339 |
+
translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
|
340 |
+
|
341 |
progress(0, desc="init transcribe")
|
342 |
# Result
|
343 |
download = []
|
|
|
348 |
# Write result
|
349 |
downloadDirectory = tempfile.mkdtemp()
|
350 |
source_index = 0
|
351 |
+
extra_tasks_count = 1 if translationLang is not None else 0
|
352 |
|
353 |
outputDirectory = self.output_dir if self.output_dir is not None else downloadDirectory
|
354 |
|
|
|
377 |
sub_task_total=sub_task_total)
|
378 |
|
379 |
# Transcribe
|
380 |
+
result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
|
381 |
+
if whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
|
382 |
+
whisperLang = get_lang_from_whisper_code(result["language"])
|
383 |
+
translationModel.whisperLang = whisperLang
|
384 |
|
385 |
short_name, suffix = source.get_short_name_suffix(max_length=self.app_config.input_max_file_name_length)
|
386 |
filePrefix = slugify(source_prefix + short_name, allow_unicode=True)
|
|
|
388 |
# Update progress
|
389 |
current_progress += source_audio_duration
|
390 |
|
391 |
+
source_download, source_text, source_vtt = self.write_result(result, whisperLang, translationModel, filePrefix + suffix.replace(".", "_"), outputDirectory, highlight_words, scaled_progress_listener)
|
392 |
|
393 |
if self.app_config.merge_subtitle_with_sources and self.app_config.output_dir is not None:
|
394 |
print("\nmerge subtitle(srt) with source file [" + source.source_name + "]\n")
|
|
|
397 |
srt_path = source_download[0]
|
398 |
save_path = os.path.join(self.app_config.output_dir, filePrefix)
|
399 |
# save_without_ext, ext = os.path.splitext(save_path)
|
400 |
+
source_lang = "." + whisperLang.whisper.code if whisperLang is not None and whisperLang.whisper is not None else ""
|
401 |
+
translate_lang = "." + translationLang.nllb.code if translationLang is not None else ""
|
402 |
output_with_srt = save_path + source_lang + translate_lang + suffix
|
403 |
|
404 |
#ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
|
|
|
473 |
except ExceededMaximumDuration as e:
|
474 |
return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
|
475 |
except Exception as e:
|
|
|
476 |
print(traceback.format_exc())
|
477 |
+
return [], ("Error occurred during transcribe: " + str(e)), traceback.format_exc()
|
478 |
|
479 |
|
480 |
+
def transcribe_file(self, model: AbstractWhisperContainer, audio_path: str, languageCode: str, task: str = None,
|
481 |
vadOptions: VadOptions = VadOptions(),
|
482 |
progressListener: ProgressListener = None, **decodeOptions: dict):
|
483 |
|
|
|
507 |
raise ValueError("Invalid vadInitialPromptMode: " + initial_prompt_mode)
|
508 |
|
509 |
# Callable for processing an audio file
|
510 |
+
whisperCallable = model.create_callback(languageCode, task, prompt_strategy=prompt_strategy, **decodeOptions)
|
511 |
|
512 |
# The results
|
513 |
if (vadOptions.vad == 'silero-vad'):
|
|
|
622 |
|
623 |
return config
|
624 |
|
625 |
+
def write_result(self, result: dict, whisperLang: TranslationLang, translationModel: TranslationModel, source_name: str, output_dir: str, highlight_words: bool = False, progressListener: ProgressListener = None):
|
626 |
if not os.path.exists(output_dir):
|
627 |
os.makedirs(output_dir)
|
628 |
|
|
|
631 |
language = result["language"]
|
632 |
languageMaxLineWidth = self.__get_max_line_width(language)
|
633 |
|
634 |
+
if translationModel is not None and translationModel.translationLang is not None:
|
635 |
try:
|
636 |
segments_progress_listener = SubTaskProgressListener(progressListener,
|
637 |
base_task_total=progressListener.sub_task_total,
|
|
|
639 |
sub_task_total=1)
|
640 |
pbar = tqdm.tqdm(total=len(segments))
|
641 |
perf_start_time = time.perf_counter()
|
642 |
+
translationModel.load_model()
|
643 |
for idx, segment in enumerate(segments):
|
644 |
seg_text = segment["text"]
|
645 |
+
segment["original"] = seg_text
|
646 |
+
segment["text"] = translationModel.translation(seg_text)
|
|
|
|
|
647 |
pbar.update(1)
|
648 |
segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")
|
649 |
|
650 |
+
translationModel.release_vram()
|
651 |
perf_end_time = time.perf_counter()
|
652 |
# Call the finished callback
|
653 |
if segments_progress_listener is not None:
|
|
|
656 |
print("\n\nprocess segments took {} seconds.\n\n".format(perf_end_time - perf_start_time))
|
657 |
except Exception as e:
|
658 |
# Ignore error - it's just a cleanup
|
659 |
+
print(traceback.format_exc())
|
660 |
print("Error process segments: " + str(e))
|
661 |
|
662 |
print("Max line width " + str(languageMaxLineWidth) + " for language:" + language)
|
663 |
vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
|
664 |
srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
|
665 |
json_result = json.dumps(result, indent=4, ensure_ascii=False)
|
666 |
+
srt_original = None
|
667 |
+
srt_bilingual = None
|
668 |
+
if translationModel is not None and translationModel.translationLang is not None:
|
669 |
+
srt_original = self.__get_subs(result["segments"], "srt_original", languageMaxLineWidth, highlight_words=highlight_words)
|
670 |
+
srt_bilingual = self.__get_subs(result["segments"], "srt_bilingual", languageMaxLineWidth, highlight_words=highlight_words)
|
671 |
+
|
672 |
+
whisperLangZho: bool = whisperLang is not None and whisperLang.nllb is not None and whisperLang.nllb.code in ["zho_Hant", "zho_Hans", "yue_Hant"]
|
673 |
+
translationZho: bool = translationModel is not None and translationModel.translationLang is not None and translationModel.translationLang.nllb is not None and translationModel.translationLang.nllb.code in ["zho_Hant", "zho_Hans", "yue_Hant"]
|
674 |
+
if whisperLangZho or translationZho:
|
675 |
+
locale = None
|
676 |
+
if whisperLangZho:
|
677 |
+
if whisperLang.nllb.code == "zho_Hant":
|
678 |
+
locale = "zh-tw"
|
679 |
+
elif whisperLang.nllb.code == "zho_Hans":
|
680 |
+
locale = "zh-cn"
|
681 |
+
elif whisperLang.nllb.code == "yue_Hant":
|
682 |
+
locale = "zh-hk"
|
683 |
+
if translationZho:
|
684 |
+
if translationModel.translationLang.nllb.code == "zho_Hant":
|
685 |
+
locale = "zh-tw"
|
686 |
+
elif translationModel.translationLang.nllb.code == "zho_Hans":
|
687 |
+
locale = "zh-cn"
|
688 |
+
elif translationModel.translationLang.nllb.code == "yue_Hant":
|
689 |
+
locale = "zh-hk"
|
690 |
+
if locale is not None:
|
691 |
+
vtt = zhconv.convert(vtt, locale)
|
692 |
+
srt = zhconv.convert(srt, locale)
|
693 |
+
text = zhconv.convert(text, locale)
|
694 |
+
json_result = zhconv.convert(json_result, locale)
|
695 |
+
if translationModel is not None and translationModel.translationLang is not None:
|
696 |
+
if srt_original is not None and len(srt_original) > 0:
|
697 |
+
srt_original = zhconv.convert(srt_original, locale)
|
698 |
+
if srt_bilingual is not None and len(srt_bilingual) > 0:
|
699 |
+
srt_bilingual = zhconv.convert(srt_bilingual, locale)
|
700 |
|
701 |
output_files = []
|
702 |
output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
|
703 |
output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
|
704 |
output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"));
|
705 |
output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"));
|
706 |
+
if srt_original is not None and len(srt_original) > 0:
|
707 |
+
output_files.append(self.__create_file(srt_original, output_dir, source_name + "-original.srt"));
|
708 |
+
if srt_bilingual is not None and len(srt_bilingual) > 0:
|
709 |
+
output_files.append(self.__create_file(srt_bilingual, output_dir, source_name + "-bilingual.srt"));
|
710 |
|
711 |
return output_files, text, vtt
|
712 |
|
|
|
733 |
write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
|
734 |
elif format == 'srt':
|
735 |
write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
|
736 |
+
elif format == 'srt_original':
|
737 |
+
write_srt_original(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
|
738 |
+
elif format == 'srt_bilingual':
|
739 |
+
write_srt_original(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words, bilingual=True)
|
740 |
else:
|
741 |
raise Exception("Unknown format " + format)
|
742 |
|
|
|
765 |
self.diarization = None
|
766 |
|
767 |
def create_ui(app_config: ApplicationConfig):
|
768 |
+
optionsMd: str = None
|
769 |
+
readmeMd: str = None
|
770 |
+
try:
|
771 |
+
with open("docs\options.md", "r", encoding="utf-8") as optionsFile:
|
772 |
+
optionsMd = optionsFile.read()
|
773 |
+
with open("README.md", "r", encoding="utf-8") as readmeFile:
|
774 |
+
readmeMd = readmeFile.read()
|
775 |
+
except Exception as e:
|
776 |
+
print("Error occurred during read options.md file: ", str(e))
|
777 |
+
|
778 |
ui = WhisperTranscriber(app_config.input_audio_max_duration, app_config.vad_process_timeout, app_config.vad_cpu_cores,
|
779 |
app_config.delete_uploaded_files, app_config.output_dir, app_config)
|
780 |
|
|
|
793 |
# Try to convert from camel-case to title-case
|
794 |
implementation_name = app_config.whisper_implementation.title().replace("_", " ").replace("-", " ")
|
795 |
|
796 |
+
uiDescription = implementation_name + " is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
|
797 |
+
uiDescription += " audio and is also a multi-task model that can perform multilingual speech recognition "
|
798 |
+
uiDescription += " as well as speech translation and language identification. "
|
799 |
|
800 |
+
uiDescription += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
|
801 |
|
802 |
# Recommend faster-whisper
|
803 |
if is_whisper:
|
804 |
+
uiDescription += "\n\n\n\nFor faster inference on GPU, try [faster-whisper](https://huggingface.co/spaces/aadnk/faster-whisper-webui)."
|
805 |
|
806 |
if app_config.input_audio_max_duration > 0:
|
807 |
+
uiDescription += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
|
808 |
+
|
809 |
+
uiArticle = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
|
810 |
+
uiArticle += "\n\nWhisper's Task 'translate' only implements the functionality of translating other languages into English. "
|
811 |
+
uiArticle += "OpenAI does not guarantee translations between arbitrary languages. In such cases, you can choose to use the NLLB Model to implement the translation task. "
|
812 |
+
uiArticle += "However, it's important to note that the NLLB Model runs slowly, and the completion time may be twice as long as usual. "
|
813 |
+
uiArticle += "\n\nThe larger the parameters of the NLLB model, the better its performance is expected to be. "
|
814 |
+
uiArticle += "However, it also requires higher computational resources, making it slower to operate. "
|
815 |
+
uiArticle += "On the other hand, the version converted from ct2 (CTranslate2) requires lower resources and operates at a faster speed."
|
816 |
+
uiArticle += "\n\nCurrently, enabling word-level timestamps cannot be used in conjunction with NLLB Model translation "
|
817 |
+
uiArticle += "because Word Timestamps will split the source text, and after translation, it becomes a non-word-level string. "
|
818 |
+
uiArticle += "\n\nThe 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. "
|
819 |
+
uiArticle += "This model has a relatively good translation speed, but it only supports three languages: Chinese, Japanese, and English. "
|
820 |
+
|
821 |
+
whisper_models = app_config.get_model_names("whisper")
|
822 |
+
nllb_models = app_config.get_model_names("nllb")
|
823 |
+
m2m100_models = app_config.get_model_names("m2m100")
|
824 |
+
mt5_models = app_config.get_model_names("mt5")
|
825 |
|
826 |
+
common_whisper_inputs = lambda : {
|
827 |
+
gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
|
828 |
+
gr.Dropdown(label="Whisper - Language", choices=sorted(get_lang_whisper_names()), value=app_config.language, elem_id="whisperLangName"),
|
829 |
+
}
|
830 |
+
common_m2m100_inputs = lambda : {
|
831 |
+
gr.Dropdown(label="M2M100 - Model (for translate)", choices=m2m100_models, elem_id="m2m100ModelName"),
|
832 |
+
gr.Dropdown(label="M2M100 - Language", choices=sorted(get_lang_m2m100_names()), elem_id="m2m100LangName"),
|
833 |
+
}
|
834 |
+
common_nllb_inputs = lambda : {
|
835 |
+
gr.Dropdown(label="NLLB - Model (for translate)", choices=nllb_models, elem_id="nllbModelName"),
|
836 |
+
gr.Dropdown(label="NLLB - Language", choices=sorted(get_lang_nllb_names()), elem_id="nllbLangName"),
|
837 |
+
}
|
838 |
+
common_mt5_inputs = lambda : {
|
839 |
+
gr.Dropdown(label="MT5 - Model (for translate)", choices=mt5_models, elem_id="mt5ModelName"),
|
840 |
+
gr.Dropdown(label="MT5 - Language", choices=sorted(get_lang_m2m100_names(["en", "ja", "zh"])), elem_id="mt5LangName"),
|
841 |
+
}
|
|
|
|
|
|
|
|
|
842 |
|
843 |
+
common_translation_inputs = lambda : {
|
844 |
+
gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
|
845 |
+
gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
|
846 |
+
gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams")
|
847 |
+
}
|
848 |
+
|
849 |
+
common_vad_inputs = lambda : {
|
850 |
+
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD", elem_id="vad"),
|
851 |
+
gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window, elem_id="vadMergeWindow"),
|
852 |
+
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size, elem_id="vadMaxMergeSize"),
|
853 |
+
}
|
854 |
+
|
855 |
+
common_word_timestamps_inputs = lambda : {
|
856 |
+
gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps"),
|
857 |
+
gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words"),
|
858 |
+
}
|
859 |
|
860 |
has_diarization_libs = Diarization.has_libraries()
|
861 |
|
|
|
863 |
print("Diarization libraries not found - disabling diarization")
|
864 |
app_config.diarization = False
|
865 |
|
866 |
+
common_diarization_inputs = lambda : {
|
867 |
+
gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization"),
|
868 |
+
gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers"),
|
869 |
+
gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers"),
|
870 |
+
gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers")
|
871 |
+
}
|
872 |
|
873 |
common_output = lambda : [
|
874 |
gr.File(label="Download"),
|
|
|
878 |
|
879 |
is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
|
880 |
|
881 |
+
simpleInputDict = {}
|
882 |
+
|
883 |
+
with gr.Blocks() as simpleTranscribe:
|
884 |
+
simpleTranslateInput = gr.State(value="m2m100", elem_id = "translateInput")
|
885 |
+
simpleSourceInput = gr.State(value="urlData", elem_id = "sourceInput")
|
886 |
+
gr.Markdown(uiDescription)
|
887 |
with gr.Row():
|
888 |
with gr.Column():
|
889 |
+
simpleSubmit = gr.Button("Submit", variant="primary")
|
890 |
with gr.Column():
|
891 |
with gr.Row():
|
892 |
+
simpleInputDict = common_whisper_inputs()
|
893 |
+
with gr.Tab(label="M2M100") as simpleM2M100Tab:
|
894 |
+
with gr.Row():
|
895 |
+
simpleInputDict.update(common_m2m100_inputs())
|
896 |
+
with gr.Tab(label="NLLB") as simpleNllbTab:
|
897 |
+
with gr.Row():
|
898 |
+
simpleInputDict.update(common_nllb_inputs())
|
899 |
+
with gr.Tab(label="MT5") as simpleMT5Tab:
|
900 |
+
with gr.Row():
|
901 |
+
simpleInputDict.update(common_mt5_inputs())
|
902 |
+
simpleM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [simpleTranslateInput] )
|
903 |
+
simpleNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [simpleTranslateInput] )
|
904 |
+
simpleMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [simpleTranslateInput] )
|
905 |
with gr.Column():
|
906 |
+
with gr.Tab(label="URL") as simpleUrlTab:
|
907 |
+
simpleInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})
|
908 |
+
with gr.Tab(label="Upload") as simpleUploadTab:
|
909 |
+
simpleInputDict.update({gr.File(label="Upload Files", file_count="multiple", elem_id = "multipleFiles")})
|
910 |
+
with gr.Tab(label="Microphone") as simpleMicTab:
|
911 |
+
simpleInputDict.update({gr.Audio(source="microphone", type="filepath", label="Microphone Input", elem_id = "microphoneData")})
|
912 |
+
simpleUrlTab.select(fn=lambda: "urlData", inputs = [], outputs= [simpleSourceInput] )
|
913 |
+
simpleUploadTab.select(fn=lambda: "multipleFiles", inputs = [], outputs= [simpleSourceInput] )
|
914 |
+
simpleMicTab.select(fn=lambda: "microphoneData", inputs = [], outputs= [simpleSourceInput] )
|
915 |
+
simpleInputDict.update({gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task, elem_id = "task")})
|
916 |
+
with gr.Accordion("VAD options", open=False):
|
917 |
+
simpleInputDict.update(common_vad_inputs())
|
918 |
+
with gr.Accordion("Word Timestamps options", open=False):
|
919 |
+
simpleInputDict.update(common_word_timestamps_inputs())
|
920 |
+
with gr.Accordion("Diarization options", open=False):
|
921 |
+
simpleInputDict.update(common_diarization_inputs())
|
922 |
+
with gr.Accordion("Translation options", open=False):
|
923 |
+
simpleInputDict.update(common_translation_inputs())
|
924 |
with gr.Column():
|
925 |
+
simpleOutput = common_output()
|
926 |
+
with gr.Accordion("Article"):
|
927 |
+
gr.Markdown(uiArticle)
|
928 |
+
if optionsMd is not None:
|
929 |
+
with gr.Accordion("docs/options.md", open=False):
|
930 |
+
gr.Markdown(optionsMd)
|
931 |
+
if readmeMd is not None:
|
932 |
+
with gr.Accordion("README.md", open=False):
|
933 |
+
gr.Markdown(readmeMd)
|
934 |
+
|
935 |
+
simpleInputDict.update({simpleTranslateInput, simpleSourceInput})
|
936 |
+
simpleSubmit.click(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
|
937 |
+
inputs=simpleInputDict, outputs=simpleOutput)
|
938 |
|
939 |
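Since a gr.Tab is not itself an input component, the simple interface above tracks the active translation-model and audio-source tabs in hidden gr.State holders that each Tab.select handler overwrites. A minimal self-contained sketch of that pattern (assuming the gradio==3.50.2 pinned by this commit; the component names here are illustrative, not from the repository):

import gradio as gr

with gr.Blocks() as demo:
    # Hidden holder recording which tab is currently active.
    active_tab = gr.State(value="m2m100")

    with gr.Tab(label="M2M100") as m2m100_tab:
        gr.Markdown("M2M100 options would go here")
    with gr.Tab(label="NLLB") as nllb_tab:
        gr.Markdown("NLLB options would go here")

    # Each select handler returns the constant identifying its tab.
    m2m100_tab.select(fn=lambda: "m2m100", inputs=[], outputs=[active_tab])
    nllb_tab.select(fn=lambda: "nllb", inputs=[], outputs=[active_tab])

    # Any later event can read the state to learn which tab was active.
    show = gr.Button("Show active tab")
    out = gr.Textbox(label="Active tab")
    show.click(fn=lambda tab: tab, inputs=[active_tab], outputs=[out])

if __name__ == "__main__":
    demo.launch()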
+    fullInputDict = {}
+    fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."

+    with gr.Blocks() as fullTranscribe:
+        fullTranslateInput = gr.State(value="m2m100", elem_id = "translateInput")
+        fullSourceInput = gr.State(value="urlData", elem_id = "sourceInput")
+        gr.Markdown(fullDescription)
         with gr.Row():
             with gr.Column():
+                fullSubmit = gr.Button("Submit", variant="primary")
             with gr.Column():
                 with gr.Row():
+                    fullInputDict = common_whisper_inputs()
+                with gr.Tab(label="M2M100") as fullM2M100Tab:
+                    with gr.Row():
+                        fullInputDict.update(common_m2m100_inputs())
+                with gr.Tab(label="NLLB") as fullNllbTab:
+                    with gr.Row():
+                        fullInputDict.update(common_nllb_inputs())
+                with gr.Tab(label="MT5") as fullMT5Tab:
+                    with gr.Row():
+                        fullInputDict.update(common_mt5_inputs())
+                fullM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [fullTranslateInput] )
+                fullNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [fullTranslateInput] )
+                fullMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [fullTranslateInput] )
             with gr.Column():
+                with gr.Tab(label="URL") as fullUrlTab:
+                    fullInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})
+                with gr.Tab(label="Upload") as fullUploadTab:
+                    fullInputDict.update({gr.File(label="Upload Files", file_count="multiple", elem_id = "multipleFiles")})
+                with gr.Tab(label="Microphone") as fullMicTab:
+                    fullInputDict.update({gr.Audio(source="microphone", type="filepath", label="Microphone Input", elem_id = "microphoneData")})
+                fullUrlTab.select(fn=lambda: "urlData", inputs = [], outputs= [fullSourceInput] )
+                fullUploadTab.select(fn=lambda: "multipleFiles", inputs = [], outputs= [fullSourceInput] )
+                fullMicTab.select(fn=lambda: "microphoneData", inputs = [], outputs= [fullSourceInput] )
+                fullInputDict.update({gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task, elem_id = "task")})
+                with gr.Accordion("VAD options", open=False):
+                    fullInputDict.update(common_vad_inputs())
+                    fullInputDict.update({
+                        gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding, elem_id = "vadPadding"),
+                        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window, elem_id = "vadPromptWindow"),
+                        gr.Dropdown(choices=VAD_INITIAL_PROMPT_MODE_VALUES, label="VAD - Initial Prompt Mode", value=app_config.vad_initial_prompt_mode, elem_id = "vadInitialPromptMode")})
+                with gr.Accordion("Word Timestamps options", open=False):
+                    fullInputDict.update(common_word_timestamps_inputs())
+                    fullInputDict.update({
+                        gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations"),
+                        gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations")})
+                with gr.Accordion("Whisper Advanced options", open=False):
+                    fullInputDict.update({
+                        gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt"),
+                        gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature"),
+                        gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
+                        gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
+                        gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
+                        gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty, elem_id = "length_penalty"),
+                        gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
+                        gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
+                        gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
+                        gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback"),
+                        gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold"),
+                        gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold"),
+                        gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold"),
+                    })
+                    if app_config.whisper_implementation == "faster-whisper":
+                        fullInputDict.update({
+                            gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty"),
+                            gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size")
+                        })
+                with gr.Accordion("Diarization options", open=False):
+                    fullInputDict.update(common_diarization_inputs())
+                with gr.Accordion("Translation options", open=False):
+                    fullInputDict.update(common_translation_inputs())
             with gr.Column():
+                fullOutput = common_output()
+                with gr.Accordion("Article"):
+                    gr.Markdown(uiArticle)
+                if optionsMd is not None:
+                    with gr.Accordion("docs/options.md", open=False):
+                        gr.Markdown(optionsMd)
+                if readmeMd is not None:
+                    with gr.Accordion("README.md", open=False):
+                        gr.Markdown(readmeMd)
+
+        fullInputDict.update({fullTranslateInput, fullSourceInput})
+        fullSubmit.click(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
+                         inputs=fullInputDict, outputs=fullOutput)

+    demo = gr.TabbedInterface([simpleTranscribe, fullTranscribe], tab_names=["Simple", "Full"])
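gr.TabbedInterface is what stitches the two Blocks into the final Simple/Full page, as the last added line of this hunk shows. A standalone sketch of the same composition (the tab contents are placeholders):

import gradio as gr

with gr.Blocks() as simple:
    gr.Markdown("Simple interface")

with gr.Blocks() as full:
    gr.Markdown("Full interface with advanced options")

# Each Blocks becomes one top-level tab; tab_names controls the labels.
demo = gr.TabbedInterface([simple, full], tab_names=["Simple", "Full"])

if __name__ == "__main__":
    demo.launch()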
     # Queue up the demo
     if is_queue_mode:
 ...

 if __name__ == '__main__':
     default_app_config = ApplicationConfig.create_default()
+    whisper_models = default_app_config.get_model_names("whisper")

     # Environment variable overrides
     default_whisper_implementation = os.environ.get("WHISPER_IMPLEMENTATION", default_app_config.whisper_implementation)
 ...
                         help="the compute type to use for inference")
     parser.add_argument("--threads", type=optional_int, default=0,
                         help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
+
     parser.add_argument("--vad_max_merge_size", type=int, default=default_app_config.vad_max_merge_size, \
                         help="The number of VAD - Max Merge Size (s).") # 30
+    parser.add_argument("--language", type=str, default=None, choices=sorted(get_lang_whisper_names()) + sorted([k.title() for k in _TO_LANG_CODE_WHISPER.keys()]),
                         help="language spoken in the audio, specify None to perform language detection")
     parser.add_argument("--save_downloaded_files", action='store_true', \
                         help="True to move downloaded files to outputs directory. This argument will take effect only after output_dir is set.")
 ...
                         help="Maximum length of a file name.")
     parser.add_argument("--autolaunch", action='store_true', \
                         help="open the webui URL in the system's default browser upon launch")
+
     parser.add_argument('--auth_token', type=str, default=default_app_config.auth_token, help='HuggingFace API Token (optional)')
     parser.add_argument("--diarization", type=str2bool, default=default_app_config.diarization, \
                         help="whether to perform speaker diarization")
cli.py
@@ -10,7 +10,7 @@ from app import VadOptions, WhisperTranscriber
 from src.config import VAD_INITIAL_PROMPT_MODE_VALUES, ApplicationConfig, VadInitialPromptMode
 from src.diarization.diarization import Diarization
 from src.download import download_url
-from src.languages import get_language_names
+from src.translation.translationLangs import get_lang_whisper_names # from src.languages import get_language_names
 from src.utils import optional_float, optional_int, str2bool
 from src.whisper.whisperFactory import create_whisper_container
@@ -43,7 +43,7 @@ def cli():
 
     parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
                         help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
-    parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(get_language_names()), \
+    parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(get_lang_whisper_names()), \
                         help="language spoken in the audio, specify None to perform language detection")
 
     parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
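With this change, --language draws its choices from the merged translationLangs table, so argparse itself rejects names the new table does not know. A small self-contained sketch of that validation (the stub below stands in for src.translation.translationLangs.get_lang_whisper_names; its values are illustrative):

import argparse

def get_lang_whisper_names():
    # Stand-in for the real lookup in src/translation/translationLangs.py.
    return ["English", "Japanese", "Chinese"]

parser = argparse.ArgumentParser()
# choices= makes argparse reject any value outside the generated list.
parser.add_argument("--language", type=str, default=None,
                    choices=sorted(get_lang_whisper_names()),
                    help="language spoken in the audio; omit to auto-detect")

args = parser.parse_args(["--language", "Japanese"])
print(args.language)  # Japanese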
config.json5
@@ -1,254 +1,300 @@
 {
 ...
     ],
 ...
-      "type": "huggingface"
-    },
-    {
-      "name": "mt5-zh-ja-en-trimmed/K024",
-      "url": "K024/mt5-zh-ja-en-trimmed",
-      "type": "huggingface"
-    },
-    {
-      "name": "mt5-zh-ja-en-trimmed-fine-tuned-v1/engmatic-earth",
-      "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
-      "type": "huggingface"
-    },
-    {
-      "name": "nllb-200-distilled-600M/facebook",
-      "url": "facebook/nllb-200-distilled-600M",
-      "type": "huggingface"
-    },
-    {
-      "name": "nllb-200-distilled-600M-ct2/JustFrederik",
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2",
-      "type": "huggingface"
-    },
-    {
-      "name": "nllb-200-distilled-600M-ct2:float16/JustFrederik",
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2-float16",
-      "type": "huggingface"
-    },
-    {
-      "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
-      "type": "huggingface"
-    },
-    // Uncomment to add official Facebook 1.3B and 3.3B model
-    // The official Facebook 1.3B and 3.3B model files are too large,
-    // and to avoid occupying too much disk space on Hugging Face's free spaces,
-    // these models are not included in the config.
-    //{
-    //  "name": "nllb-200-distilled-1.3B/facebook",
-    //  "url": "facebook/nllb-200-distilled-1.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-1.3B/facebook",
-    //  "url": "facebook/nllb-200-1.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-3.3B/facebook",
-    //  "url": "facebook/nllb-200-3.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-1.3B-ct2/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-1.3B-ct2",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
-    //  "type": "huggingface"
-    //},
     ],
 ...
 }
 {
+  "models": {
+    "whisper": [
+      // Configuration for the built-in models. You can remove any of these
+      // if you don't want to use the default models.
+      {
+        "name": "tiny",
+        "url": "tiny"
+      },
+      {
+        "name": "base",
+        "url": "base"
+      },
+      {
+        "name": "small",
+        "url": "small"
+      },
+      {
+        "name": "medium",
+        "url": "medium"
+      },
+      {
+        "name": "large",
+        "url": "large"
+      },
+      {
+        "name": "large-v2",
+        "url": "large-v2"
+      },
+      {
+        "name": "large-v3",
+        "url": "large-v3"
+      }
+      // Uncomment to add custom Japanese models
+      //{
+      //  "name": "whisper-large-v2-mix-jp",
+      //  "url": "vumichien/whisper-large-v2-mix-jp",
+      //  // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default.
+      //  // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models.
+      //  "type": "huggingface",
+      //},
+      //{
+      //  "name": "local-model",
+      //  "url": "path/to/local/model",
+      //},
+      //{
+      //  "name": "remote-model",
+      //  "url": "https://example.com/path/to/model",
+      //}
     ],
+    "m2m100": [
+      {
+        "name": "m2m100_1.2B-ct2fast/michaelfeil",
+        "url": "michaelfeil/ct2fast-m2m100_1.2B",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/m2m100_1.2B"
+      },
+      {
+        "name": "m2m100_418M-ct2fast/michaelfeil",
+        "url": "michaelfeil/ct2fast-m2m100_418M",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/m2m100_418M"
+      },
+      //{
+      //  "name": "m2m100-12B-ct2fast/michaelfeil",
+      //  "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
+      //  "type": "huggingface",
+      //  "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
+      //},
+      {
+        "name": "m2m100_1.2B/facebook",
+        "url": "facebook/m2m100_1.2B",
+        "type": "huggingface"
+      },
+      {
+        "name": "m2m100_418M/facebook",
+        "url": "facebook/m2m100_418M",
+        "type": "huggingface"
+      }
     ],
+    "nllb": [
+      {
+        "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
+        "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+      },
+      {
+        "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
+        "url": "michaelfeil/ct2fast-nllb-200-3.3B",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-3.3B"
+      },
+      {
+        "name": "nllb-200-1.3B-ct2:float16/JustFrederik",
+        "url": "JustFrederik/nllb-200-1.3B-ct2-float16",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-1.3B"
+      },
+      {
+        "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
+        "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+      },
+      {
+        "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
+        "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-1.3B"
+      },
+      {
+        "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
+        "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+      },
+      {
+        "name": "nllb-200-distilled-600M/facebook",
+        "url": "facebook/nllb-200-distilled-600M",
+        "type": "huggingface"
+      },
+      {
+        "name": "nllb-200-distilled-600M-ct2/JustFrederik",
+        "url": "JustFrederik/nllb-200-distilled-600M-ct2",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-distilled-600M"
+      },
+      {
+        "name": "nllb-200-distilled-600M-ct2:float16/JustFrederik",
+        "url": "JustFrederik/nllb-200-distilled-600M-ct2-float16",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-distilled-600M"
+      },
+      {
+        "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
+        "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
+        "type": "huggingface",
+        "tokenizer_url": "facebook/nllb-200-distilled-600M"
+      }
+      // Uncomment to add official Facebook 1.3B and 3.3B model
+      // The official Facebook 1.3B and 3.3B model files are too large,
+      // and to avoid occupying too much disk space on Hugging Face's free spaces,
+      // these models are not included in the config.
+      //{
+      //  "name": "nllb-200-distilled-1.3B/facebook",
+      //  "url": "facebook/nllb-200-distilled-1.3B",
+      //  "type": "huggingface"
+      //},
+      //{
+      //  "name": "nllb-200-1.3B/facebook",
+      //  "url": "facebook/nllb-200-1.3B",
+      //  "type": "huggingface"
+      //},
+      //{
+      //  "name": "nllb-200-3.3B/facebook",
+      //  "url": "facebook/nllb-200-3.3B",
+      //  "type": "huggingface"
+      //},
+      //{
+      //  "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
+      //  "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
+      //  "type": "huggingface",
+      //  "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+      //},
+      //{
+      //  "name": "nllb-200-1.3B-ct2/JustFrederik",
+      //  "url": "JustFrederik/nllb-200-1.3B-ct2",
+      //  "type": "huggingface",
+      //  "tokenizer_url": "facebook/nllb-200-1.3B"
+      //},
+      //{
+      //  "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
+      //  "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
+      //  "type": "huggingface",
+      //  "tokenizer_url": "facebook/nllb-200-3.3B"
+      //},
+    ],
+    "mt5": [
+      {
+        "name": "mt5-zh-ja-en-trimmed/K024",
+        "url": "K024/mt5-zh-ja-en-trimmed",
+        "type": "huggingface"
+      },
+      {
+        "name": "mt5-zh-ja-en-trimmed-fine-tuned-v1/engmatic-earth",
+        "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
+        "type": "huggingface"
+      }
+    ]
+  },
+  // Configuration options that will be used if they are not specified in the command line arguments.
 
+  // * WEBUI options *
 
+  // Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI.
+  "input_audio_max_duration": 1800,
+  // True to share the app on HuggingFace.
+  "share": false,
+  // The host or IP to bind to. If None, bind to localhost.
+  "server_name": null,
+  // The port to bind to.
+  "server_port": 7860,
+  // The number of workers to use for the web server. Use -1 to disable queueing.
+  "queue_concurrency_count": 1,
+  // Whether or not to automatically delete all uploaded files, to save disk space
+  "delete_uploaded_files": true,
 
+  // * General options *
 
+  // The default implementation to use for Whisper. Can be "whisper" or "faster-whisper".
+  // Note that you must either install the requirements for faster-whisper (requirements-fasterWhisper.txt)
+  // or whisper (requirements.txt)
+  "whisper_implementation": "faster-whisper",
 
+  // The default model name.
+  "default_model_name": "large-v2",
+  // The default VAD.
+  "default_vad": "silero-vad",
+  // A comma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.
+  "vad_parallel_devices": "",
+  // The number of CPU cores to use for VAD pre-processing.
+  "vad_cpu_cores": 1,
+  // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.
+  "vad_process_timeout": 1800,
+  // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.
+  "auto_parallel": false,
+  // Directory to save the outputs (CLI will use the current directory if not specified)
+  "output_dir": null,
+  // The path to save model files; uses ~/.cache/whisper by default
+  "model_dir": null,
+  // Device to use for PyTorch inference, or Null to use the default device
+  "device": null,
+  // Whether to print out the progress and debug messages
+  "verbose": true,
+  // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
+  "task": "transcribe",
+  // Language spoken in the audio, specify None to perform language detection
+  "language": null,
+  // The window size (in seconds) to merge voice segments
+  "vad_merge_window": 5,
+  // The maximum size (in seconds) of a voice segment
+  "vad_max_merge_size": 90,
+  // The padding (in seconds) to add to each voice segment
+  "vad_padding": 1,
+  // Whether or not to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment)
+  "vad_initial_prompt_mode": "prepend_first_segment",
+  // The window size of the prompt to pass to Whisper
+  "vad_prompt_window": 3,
+  // Temperature to use for sampling
+  "temperature": 0,
+  // Number of candidates when sampling with non-zero temperature
+  "best_of": 5,
+  // Number of beams in beam search, only applicable when temperature is zero
+  "beam_size": 5,
+  // Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
+  "patience": 1,
+  // Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default
+  "length_penalty": null,
+  // Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations
+  "suppress_tokens": "-1",
+  // Optional text to provide as a prompt for the first window
+  "initial_prompt": null,
+  // If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
+  "condition_on_previous_text": true,
+  // Whether to perform inference in fp16; True by default
+  "fp16": true,
+  // The compute type used by faster-whisper. Can be "int8", "int16" or "float16".
+  "compute_type": "auto",
+  // Temperature to increase when falling back when the decoding fails to meet either of the thresholds below
+  "temperature_increment_on_fallback": 0.2,
+  // If the gzip compression ratio is higher than this value, treat the decoding as failed
+  "compression_ratio_threshold": 2.4,
+  // If the average log probability is lower than this value, treat the decoding as failed
+  "logprob_threshold": -1.0,
+  // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
+  "no_speech_threshold": 0.6,
 
+  // (experimental) extract word-level timestamps and refine the results based on them
+  "word_timestamps": false,
+  // if word_timestamps is True, merge these punctuation symbols with the next word
+  "prepend_punctuations": "\"\'“¿([{-",
+  // if word_timestamps is True, merge these punctuation symbols with the previous word
+  "append_punctuations": "\"\'.。,,!!??::”)]}、",
+  // (requires --word_timestamps True) underline each word as it is spoken in srt and vtt
+  "highlight_words": false,
 
+  // Diarization settings
+  "auth_token": null,
+  // Whether to perform speaker diarization
+  "diarization": false,
+  // The number of speakers to detect
+  "diarization_speakers": 2,
+  // The minimum number of speakers to detect
+  "diarization_min_speakers": 1,
+  // The maximum number of speakers to detect
+  "diarization_max_speakers": 8,
+  // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.
+  "diarization_process_timeout": 60,
 }
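All model definitions now live under a single "models" object keyed by translator type ("whisper", "m2m100", "nllb", "mt5"), with an optional "tokenizer_url" for CTranslate2 conversions that ship without their own tokenizer. A minimal sketch of reading the grouped structure back with json5, the same parser the project uses (assumes a config.json5 like the one above in the working directory):

import json5  # pip install json5

with open("config.json5", "r", encoding="utf-8") as f:
    data = json5.load(f)

# "models" maps a group name to a list of model entries.
for group, entries in data["models"].items():
    names = [entry["name"] for entry in entries]
    print(group, "->", names)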
requirements-whisper.txt
@@ -1,6 +1,5 @@
 git+https://github.com/huggingface/transformers
 git+https://github.com/openai/whisper.git
-transformers
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
src/config.py
@@ -1,16 +1,11 @@
 from enum import Enum
-import urllib
 
 import os
-from typing import List
-from urllib.parse import urlparse
-import json5
-import torch
+from typing import List, Dict, Literal
 
-from tqdm import tqdm
 
 class ModelConfig:
-    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper"):
+    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None):
         """
         Initialize a model configuration.
 
@@ -23,6 +18,7 @@ class ModelConfig:
         self.url = url
         self.path = path
         self.type = type
+        self.tokenizer_url = tokenizer_url
 
 VAD_INITIAL_PROMPT_MODE_VALUES=["prepend_all_segments", "prepend_first_segment", "json_prompt_mode"]
 
@@ -33,7 +29,7 @@ class VadInitialPromptMode(Enum):
 
     @staticmethod
     def from_string(s: str):
-        normalized = s.lower() if s is not None else None
+        normalized = s.lower() if s is not None and len(s) > 0 else None
 
         if normalized == "prepend_all_segments":
             return VadInitialPromptMode.PREPEND_ALL_SEGMENTS
@@ -47,11 +43,11 @@ class VadInitialPromptMode(Enum):
         return None
 
 class ApplicationConfig:
-    def __init__(self, models:
-                 share: bool = False, server_name: str = None, server_port: int = 7860,
+    def __init__(self, models: Dict[Literal["whisper", "m2m100", "nllb", "mt5"], List[ModelConfig]],
+                 input_audio_max_duration: int = 600, share: bool = False, server_name: str = None, server_port: int = 7860,
                  queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
-                 whisper_implementation: str = "whisper",
+                 whisper_implementation: str = "whisper", default_model_name: str = "medium",
+                 default_nllb_model_name: str = "distilled-600M", default_vad: str = "silero-vad",
                  vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
                  auto_parallel: bool = False, output_dir: str = None,
                  model_dir: str = None, device: str = None,
@@ -66,6 +62,7 @@ class ApplicationConfig:
                  compute_type: str = "float16",
                  temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
                  logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6,
+                 repetition_penalty: float = 1.0, no_repeat_ngram_size: int = 0,
                  # Word timestamp settings
                  word_timestamps: bool = False, prepend_punctuations: str = "\"\'“¿([{-",
                  append_punctuations: str = "\"\'.。,,!!??::”)]}、",
@@ -73,10 +70,14 @@ class ApplicationConfig:
                  # Diarization
                  auth_token: str = None, diarization: bool = False, diarization_speakers: int = 2,
                  diarization_min_speakers: int = 1, diarization_max_speakers: int = 5,
-                 diarization_process_timeout: int = 60
+                 diarization_process_timeout: int = 60,
+                 # Translation
+                 translation_batch_size: int = 2,
+                 translation_no_repeat_ngram_size: int = 3,
+                 translation_num_beams: int = 2,
+                 ):
 
         self.models = models
-        self.nllb_models = nllb_models
 
         # WebUI settings
         self.input_audio_max_duration = input_audio_max_duration
@@ -120,6 +121,8 @@ class ApplicationConfig:
         self.compression_ratio_threshold = compression_ratio_threshold
         self.logprob_threshold = logprob_threshold
         self.no_speech_threshold = no_speech_threshold
+        self.repetition_penalty = repetition_penalty
+        self.no_repeat_ngram_size = no_repeat_ngram_size
 
         # Word timestamp settings
         self.word_timestamps = word_timestamps
@@ -134,12 +137,13 @@ class ApplicationConfig:
         self.diarization_min_speakers = diarization_min_speakers
         self.diarization_max_speakers = diarization_max_speakers
         self.diarization_process_timeout = diarization_process_timeout
+        # Translation
+        self.translation_batch_size = translation_batch_size
+        self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
+        self.translation_num_beams = translation_num_beams
 
-    def get_model_names(self):
-        return [ x.name for x in self.models ]
-
-    def get_nllb_model_names(self):
-        return [ x.name for x in self.nllb_models ]
+    def get_model_names(self, name: str):
+        return [ x.name for x in self.models[name] ]
 
     def update(self, **new_values):
         result = ApplicationConfig(**self.__dict__)
@@ -165,9 +169,9 @@ class ApplicationConfig:
         # Load using json5
         data = json5.load(f)
         data_models = data.pop("models", [])
+        models: Dict[Literal["whisper", "m2m100", "nllb", "mt5"], List[ModelConfig]] = {
+            key: [ModelConfig(**item) for item in value]
+            for key, value in data_models.items()
+        }
 
-        return ApplicationConfig(models,
+        return ApplicationConfig(models, **data)
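The loader above builds the typed models dict with a comprehension, and ApplicationConfig.update re-runs __init__ on **self.__dict__, which only works because every attribute keeps the name of its constructor parameter. A toy sketch of that copy-with-overrides idiom (the setattr loop is an assumption for illustration, not the project's exact method body):

class Config:
    def __init__(self, model: str = "medium", beam_size: int = 5):
        self.model = model
        self.beam_size = beam_size

    def update(self, **new_values):
        # Re-run __init__ with the current attribute dict, then apply overrides.
        result = Config(**self.__dict__)
        for key, value in new_values.items():
            setattr(result, key, value)
        return result

base = Config()
tuned = base.update(beam_size=1)
print(base.beam_size, tuned.beam_size)  # 5 1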
src/languages.py
@@ -1,147 +0,0 @@
-class Language():
-    def __init__(self, code, name):
-        self.code = code
-        self.name = name
-
-    def __str__(self):
-        return "Language(code={}, name={})".format(self.code, self.name)
-
-LANGUAGES = [
-    Language('en', 'English'),
-    Language('zh', 'Chinese'),
-    Language('de', 'German'),
-    Language('es', 'Spanish'),
-    Language('ru', 'Russian'),
-    Language('ko', 'Korean'),
-    Language('fr', 'French'),
-    Language('ja', 'Japanese'),
-    Language('pt', 'Portuguese'),
-    Language('tr', 'Turkish'),
-    Language('pl', 'Polish'),
-    Language('ca', 'Catalan'),
-    Language('nl', 'Dutch'),
-    Language('ar', 'Arabic'),
-    Language('sv', 'Swedish'),
-    Language('it', 'Italian'),
-    Language('id', 'Indonesian'),
-    Language('hi', 'Hindi'),
-    Language('fi', 'Finnish'),
-    Language('vi', 'Vietnamese'),
-    Language('he', 'Hebrew'),
-    Language('uk', 'Ukrainian'),
-    Language('el', 'Greek'),
-    Language('ms', 'Malay'),
-    Language('cs', 'Czech'),
-    Language('ro', 'Romanian'),
-    Language('da', 'Danish'),
-    Language('hu', 'Hungarian'),
-    Language('ta', 'Tamil'),
-    Language('no', 'Norwegian'),
-    Language('th', 'Thai'),
-    Language('ur', 'Urdu'),
-    Language('hr', 'Croatian'),
-    Language('bg', 'Bulgarian'),
-    Language('lt', 'Lithuanian'),
-    Language('la', 'Latin'),
-    Language('mi', 'Maori'),
-    Language('ml', 'Malayalam'),
-    Language('cy', 'Welsh'),
-    Language('sk', 'Slovak'),
-    Language('te', 'Telugu'),
-    Language('fa', 'Persian'),
-    Language('lv', 'Latvian'),
-    Language('bn', 'Bengali'),
-    Language('sr', 'Serbian'),
-    Language('az', 'Azerbaijani'),
-    Language('sl', 'Slovenian'),
-    Language('kn', 'Kannada'),
-    Language('et', 'Estonian'),
-    Language('mk', 'Macedonian'),
-    Language('br', 'Breton'),
-    Language('eu', 'Basque'),
-    Language('is', 'Icelandic'),
-    Language('hy', 'Armenian'),
-    Language('ne', 'Nepali'),
-    Language('mn', 'Mongolian'),
-    Language('bs', 'Bosnian'),
-    Language('kk', 'Kazakh'),
-    Language('sq', 'Albanian'),
-    Language('sw', 'Swahili'),
-    Language('gl', 'Galician'),
-    Language('mr', 'Marathi'),
-    Language('pa', 'Punjabi'),
-    Language('si', 'Sinhala'),
-    Language('km', 'Khmer'),
-    Language('sn', 'Shona'),
-    Language('yo', 'Yoruba'),
-    Language('so', 'Somali'),
-    Language('af', 'Afrikaans'),
-    Language('oc', 'Occitan'),
-    Language('ka', 'Georgian'),
-    Language('be', 'Belarusian'),
-    Language('tg', 'Tajik'),
-    Language('sd', 'Sindhi'),
-    Language('gu', 'Gujarati'),
-    Language('am', 'Amharic'),
-    Language('yi', 'Yiddish'),
-    Language('lo', 'Lao'),
-    Language('uz', 'Uzbek'),
-    Language('fo', 'Faroese'),
-    Language('ht', 'Haitian creole'),
-    Language('ps', 'Pashto'),
-    Language('tk', 'Turkmen'),
-    Language('nn', 'Nynorsk'),
-    Language('mt', 'Maltese'),
-    Language('sa', 'Sanskrit'),
-    Language('lb', 'Luxembourgish'),
-    Language('my', 'Myanmar'),
-    Language('bo', 'Tibetan'),
-    Language('tl', 'Tagalog'),
-    Language('mg', 'Malagasy'),
-    Language('as', 'Assamese'),
-    Language('tt', 'Tatar'),
-    Language('haw', 'Hawaiian'),
-    Language('ln', 'Lingala'),
-    Language('ha', 'Hausa'),
-    Language('ba', 'Bashkir'),
-    Language('jw', 'Javanese'),
-    Language('su', 'Sundanese')
-]
-
-_TO_LANGUAGE_CODE = {
-    **{language.code: language for language in LANGUAGES},
-    "burmese": "my",
-    "valencian": "ca",
-    "flemish": "nl",
-    "haitian": "ht",
-    "letzeburgesch": "lb",
-    "pushto": "ps",
-    "panjabi": "pa",
-    "moldavian": "ro",
-    "moldovan": "ro",
-    "sinhalese": "si",
-    "castilian": "es",
-}
-
-_FROM_LANGUAGE_NAME = {
-    **{language.name.lower(): language for language in LANGUAGES}
-}
-
-def get_language_from_code(language_code, default=None) -> Language:
-    """Return the language name from the language code."""
-    return _TO_LANGUAGE_CODE.get(language_code, default)
-
-def get_language_from_name(language, default=None) -> Language:
-    """Return the language code from the language name."""
-    return _FROM_LANGUAGE_NAME.get(language.lower() if language else None, default)
-
-def get_language_names():
-    """Return a list of language names."""
-    return [language.name for language in LANGUAGES]
-
-if __name__ == "__main__":
-    # Test lookup
-    print(get_language_from_code('en'))
-    print(get_language_from_name('English'))
-
-    print(get_language_names())
src/nllb/nllbLangs.py
@@ -1,251 +0,0 @@
-class NllbLang():
-    def __init__(self, code, name, code_whisper=None, name_whisper=None):
-        self.code = code
-        self.name = name
-        self.code_whisper = code_whisper
-        self.name_whisper = name_whisper
-
-    def __str__(self):
-        return "Language(code={}, name={})".format(self.code, self.name)
-
-NLLB_LANGS = [
-    NllbLang('ace_Arab', 'Acehnese (Arabic script)'),
-    NllbLang('ace_Latn', 'Acehnese (Latin script)'),
-    NllbLang('acm_Arab', 'Mesopotamian Arabic', 'ar', 'Arabic'),
-    NllbLang('acq_Arab', 'Ta’izzi-Adeni Arabic', 'ar', 'Arabic'),
-    NllbLang('aeb_Arab', 'Tunisian Arabic'),
-    NllbLang('afr_Latn', 'Afrikaans', 'am', 'Amharic'),
-    NllbLang('ajp_Arab', 'South Levantine Arabic', 'ar', 'Arabic'),
-    NllbLang('aka_Latn', 'Akan'),
-    NllbLang('amh_Ethi', 'Amharic'),
-    NllbLang('apc_Arab', 'North Levantine Arabic', 'ar', 'Arabic'),
-    NllbLang('arb_Arab', 'Modern Standard Arabic', 'ar', 'Arabic'),
-    NllbLang('arb_Latn', 'Modern Standard Arabic (Romanized)'),
-    NllbLang('ars_Arab', 'Najdi Arabic', 'ar', 'Arabic'),
-    NllbLang('ary_Arab', 'Moroccan Arabic', 'ar', 'Arabic'),
-    NllbLang('arz_Arab', 'Egyptian Arabic', 'ar', 'Arabic'),
-    NllbLang('asm_Beng', 'Assamese', 'as', 'Assamese'),
-    NllbLang('ast_Latn', 'Asturian'),
-    NllbLang('awa_Deva', 'Awadhi'),
-    NllbLang('ayr_Latn', 'Central Aymara'),
-    NllbLang('azb_Arab', 'South Azerbaijani', 'az', 'Azerbaijani'),
-    NllbLang('azj_Latn', 'North Azerbaijani', 'az', 'Azerbaijani'),
-    NllbLang('bak_Cyrl', 'Bashkir', 'ba', 'Bashkir'),
-    NllbLang('bam_Latn', 'Bambara'),
-    NllbLang('ban_Latn', 'Balinese'),
-    NllbLang('bel_Cyrl', 'Belarusian', 'be', 'Belarusian'),
-    NllbLang('bem_Latn', 'Bemba'),
-    NllbLang('ben_Beng', 'Bengali', 'bn', 'Bengali'),
-    NllbLang('bho_Deva', 'Bhojpuri'),
-    NllbLang('bjn_Arab', 'Banjar (Arabic script)'),
-    NllbLang('bjn_Latn', 'Banjar (Latin script)'),
-    NllbLang('bod_Tibt', 'Standard Tibetan', 'bo', 'Tibetan'),
-    NllbLang('bos_Latn', 'Bosnian', 'bs', 'Bosnian'),
-    NllbLang('bug_Latn', 'Buginese'),
-    NllbLang('bul_Cyrl', 'Bulgarian', 'bg', 'Bulgarian'),
-    NllbLang('cat_Latn', 'Catalan', 'ca', 'Catalan'),
-    NllbLang('ceb_Latn', 'Cebuano'),
-    NllbLang('ces_Latn', 'Czech', 'cs', 'Czech'),
-    NllbLang('cjk_Latn', 'Chokwe'),
-    NllbLang('ckb_Arab', 'Central Kurdish'),
-    NllbLang('crh_Latn', 'Crimean Tatar'),
-    NllbLang('cym_Latn', 'Welsh', 'cy', 'Welsh'),
-    NllbLang('dan_Latn', 'Danish', 'da', 'Danish'),
-    NllbLang('deu_Latn', 'German', 'de', 'German'),
-    NllbLang('dik_Latn', 'Southwestern Dinka'),
-    NllbLang('dyu_Latn', 'Dyula'),
-    NllbLang('dzo_Tibt', 'Dzongkha'),
-    NllbLang('ell_Grek', 'Greek', 'el', 'Greek'),
-    NllbLang('eng_Latn', 'English', 'en', 'English'),
-    NllbLang('epo_Latn', 'Esperanto'),
-    NllbLang('est_Latn', 'Estonian', 'et', 'Estonian'),
-    NllbLang('eus_Latn', 'Basque', 'eu', 'Basque'),
-    NllbLang('ewe_Latn', 'Ewe'),
-    NllbLang('fao_Latn', 'Faroese', 'fo', 'Faroese'),
-    NllbLang('fij_Latn', 'Fijian'),
-    NllbLang('fin_Latn', 'Finnish', 'fi', 'Finnish'),
-    NllbLang('fon_Latn', 'Fon'),
-    NllbLang('fra_Latn', 'French', 'fr', 'French'),
-    NllbLang('fur_Latn', 'Friulian'),
-    NllbLang('fuv_Latn', 'Nigerian Fulfulde'),
-    NllbLang('gla_Latn', 'Scottish Gaelic'),
-    NllbLang('gle_Latn', 'Irish'),
-    NllbLang('glg_Latn', 'Galician', 'gl', 'Galician'),
-    NllbLang('grn_Latn', 'Guarani'),
-    NllbLang('guj_Gujr', 'Gujarati', 'gu', 'Gujarati'),
-    NllbLang('hat_Latn', 'Haitian Creole', 'ht', 'Haitian creole'),
-    NllbLang('hau_Latn', 'Hausa', 'ha', 'Hausa'),
-    NllbLang('heb_Hebr', 'Hebrew', 'he', 'Hebrew'),
-    NllbLang('hin_Deva', 'Hindi', 'hi', 'Hindi'),
-    NllbLang('hne_Deva', 'Chhattisgarhi'),
-    NllbLang('hrv_Latn', 'Croatian', 'hr', 'Croatian'),
-    NllbLang('hun_Latn', 'Hungarian', 'hu', 'Hungarian'),
-    NllbLang('hye_Armn', 'Armenian', 'hy', 'Armenian'),
-    NllbLang('ibo_Latn', 'Igbo'),
-    NllbLang('ilo_Latn', 'Ilocano'),
-    NllbLang('ind_Latn', 'Indonesian', 'id', 'Indonesian'),
-    NllbLang('isl_Latn', 'Icelandic', 'is', 'Icelandic'),
-    NllbLang('ita_Latn', 'Italian', 'it', 'Italian'),
-    NllbLang('jav_Latn', 'Javanese', 'jw', 'Javanese'),
-    NllbLang('jpn_Jpan', 'Japanese', 'ja', 'Japanese'),
-    NllbLang('kab_Latn', 'Kabyle'),
-    NllbLang('kac_Latn', 'Jingpho'),
-    NllbLang('kam_Latn', 'Kamba'),
-    NllbLang('kan_Knda', 'Kannada', 'kn', 'Kannada'),
-    NllbLang('kas_Arab', 'Kashmiri (Arabic script)'),
-    NllbLang('kas_Deva', 'Kashmiri (Devanagari script)'),
-    NllbLang('kat_Geor', 'Georgian', 'ka', 'Georgian'),
-    NllbLang('knc_Arab', 'Central Kanuri (Arabic script)'),
-    NllbLang('knc_Latn', 'Central Kanuri (Latin script)'),
-    NllbLang('kaz_Cyrl', 'Kazakh', 'kk', 'Kazakh'),
-    NllbLang('kbp_Latn', 'Kabiyè'),
-    NllbLang('kea_Latn', 'Kabuverdianu'),
-    NllbLang('khm_Khmr', 'Khmer', 'km', 'Khmer'),
-    NllbLang('kik_Latn', 'Kikuyu'),
-    NllbLang('kin_Latn', 'Kinyarwanda'),
-    NllbLang('kir_Cyrl', 'Kyrgyz'),
-    NllbLang('kmb_Latn', 'Kimbundu'),
-    NllbLang('kmr_Latn', 'Northern Kurdish'),
-    NllbLang('kon_Latn', 'Kikongo'),
-    NllbLang('kor_Hang', 'Korean', 'ko', 'Korean'),
-    NllbLang('lao_Laoo', 'Lao', 'lo', 'Lao'),
-    NllbLang('lij_Latn', 'Ligurian'),
-    NllbLang('lim_Latn', 'Limburgish'),
-    NllbLang('lin_Latn', 'Lingala', 'ln', 'Lingala'),
-    NllbLang('lit_Latn', 'Lithuanian', 'lt', 'Lithuanian'),
-    NllbLang('lmo_Latn', 'Lombard'),
-    NllbLang('ltg_Latn', 'Latgalian'),
-    NllbLang('ltz_Latn', 'Luxembourgish', 'lb', 'Luxembourgish'),
-    NllbLang('lua_Latn', 'Luba-Kasai'),
-    NllbLang('lug_Latn', 'Ganda'),
-    NllbLang('luo_Latn', 'Luo'),
-    NllbLang('lus_Latn', 'Mizo'),
-    NllbLang('lvs_Latn', 'Standard Latvian', 'lv', 'Latvian'),
-    NllbLang('mag_Deva', 'Magahi'),
-    NllbLang('mai_Deva', 'Maithili'),
-    NllbLang('mal_Mlym', 'Malayalam', 'ml', 'Malayalam'),
-    NllbLang('mar_Deva', 'Marathi', 'mr', 'Marathi'),
-    NllbLang('min_Arab', 'Minangkabau (Arabic script)'),
-    NllbLang('min_Latn', 'Minangkabau (Latin script)'),
-    NllbLang('mkd_Cyrl', 'Macedonian', 'mk', 'Macedonian'),
-    NllbLang('plt_Latn', 'Plateau Malagasy', 'mg', 'Malagasy'),
-    NllbLang('mlt_Latn', 'Maltese', 'mt', 'Maltese'),
-    NllbLang('mni_Beng', 'Meitei (Bengali script)'),
-    NllbLang('khk_Cyrl', 'Halh Mongolian', 'mn', 'Mongolian'),
-    NllbLang('mos_Latn', 'Mossi'),
-    NllbLang('mri_Latn', 'Maori', 'mi', 'Maori'),
-    NllbLang('mya_Mymr', 'Burmese', 'my', 'Myanmar'),
-    NllbLang('nld_Latn', 'Dutch', 'nl', 'Dutch'),
-    NllbLang('nno_Latn', 'Norwegian Nynorsk', 'nn', 'Nynorsk'),
-    NllbLang('nob_Latn', 'Norwegian Bokmål', 'no', 'Norwegian'),
-    NllbLang('npi_Deva', 'Nepali', 'ne', 'Nepali'),
-    NllbLang('nso_Latn', 'Northern Sotho'),
-    NllbLang('nus_Latn', 'Nuer'),
-    NllbLang('nya_Latn', 'Nyanja'),
-    NllbLang('oci_Latn', 'Occitan', 'oc', 'Occitan'),
-    NllbLang('gaz_Latn', 'West Central Oromo'),
-    NllbLang('ory_Orya', 'Odia'),
-    NllbLang('pag_Latn', 'Pangasinan'),
-    NllbLang('pan_Guru', 'Eastern Panjabi', 'pa', 'Punjabi'),
-    NllbLang('pap_Latn', 'Papiamento'),
-    NllbLang('pes_Arab', 'Western Persian', 'fa', 'Persian'),
-    NllbLang('pol_Latn', 'Polish', 'pl', 'Polish'),
-    NllbLang('por_Latn', 'Portuguese', 'pt', 'Portuguese'),
-    NllbLang('prs_Arab', 'Dari'),
-    NllbLang('pbt_Arab', 'Southern Pashto', 'ps', 'Pashto'),
-    NllbLang('quy_Latn', 'Ayacucho Quechua'),
-    NllbLang('ron_Latn', 'Romanian', 'ro', 'Romanian'),
-    NllbLang('run_Latn', 'Rundi'),
-    NllbLang('rus_Cyrl', 'Russian', 'ru', 'Russian'),
-    NllbLang('sag_Latn', 'Sango'),
-    NllbLang('san_Deva', 'Sanskrit', 'sa', 'Sanskrit'),
-    NllbLang('sat_Olck', 'Santali'),
-    NllbLang('scn_Latn', 'Sicilian'),
-    NllbLang('shn_Mymr', 'Shan'),
-    NllbLang('sin_Sinh', 'Sinhala', 'si', 'Sinhala'),
-    NllbLang('slk_Latn', 'Slovak', 'sk', 'Slovak'),
-    NllbLang('slv_Latn', 'Slovenian', 'sl', 'Slovenian'),
-    NllbLang('smo_Latn', 'Samoan'),
-    NllbLang('sna_Latn', 'Shona', 'sn', 'Shona'),
-    NllbLang('snd_Arab', 'Sindhi', 'sd', 'Sindhi'),
-    NllbLang('som_Latn', 'Somali', 'so', 'Somali'),
-    NllbLang('sot_Latn', 'Southern Sotho'),
-    NllbLang('spa_Latn', 'Spanish', 'es', 'Spanish'),
-    NllbLang('als_Latn', 'Tosk Albanian', 'sq', 'Albanian'),
-    NllbLang('srd_Latn', 'Sardinian'),
-    NllbLang('srp_Cyrl', 'Serbian', 'sr', 'Serbian'),
-    NllbLang('ssw_Latn', 'Swati'),
-    NllbLang('sun_Latn', 'Sundanese', 'su', 'Sundanese'),
-    NllbLang('swe_Latn', 'Swedish', 'sv', 'Swedish'),
-    NllbLang('swh_Latn', 'Swahili', 'sw', 'Swahili'),
-    NllbLang('szl_Latn', 'Silesian'),
-    NllbLang('tam_Taml', 'Tamil', 'ta', 'Tamil'),
-    NllbLang('tat_Cyrl', 'Tatar', 'tt', 'Tatar'),
-    NllbLang('tel_Telu', 'Telugu', 'te', 'Telugu'),
-    NllbLang('tgk_Cyrl', 'Tajik', 'tg', 'Tajik'),
-    NllbLang('tgl_Latn', 'Tagalog', 'tl', 'Tagalog'),
-    NllbLang('tha_Thai', 'Thai', 'th', 'Thai'),
-    NllbLang('tir_Ethi', 'Tigrinya'),
-    NllbLang('taq_Latn', 'Tamasheq (Latin script)'),
-    NllbLang('taq_Tfng', 'Tamasheq (Tifinagh script)'),
-    NllbLang('tpi_Latn', 'Tok Pisin'),
-    NllbLang('tsn_Latn', 'Tswana'),
-    NllbLang('tso_Latn', 'Tsonga'),
-    NllbLang('tuk_Latn', 'Turkmen', 'tk', 'Turkmen'),
-    NllbLang('tum_Latn', 'Tumbuka'),
-    NllbLang('tur_Latn', 'Turkish', 'tr', 'Turkish'),
-    NllbLang('twi_Latn', 'Twi'),
-    NllbLang('tzm_Tfng', 'Central Atlas Tamazight'),
-    NllbLang('uig_Arab', 'Uyghur'),
-    NllbLang('ukr_Cyrl', 'Ukrainian', 'uk', 'Ukrainian'),
-    NllbLang('umb_Latn', 'Umbundu'),
-    NllbLang('urd_Arab', 'Urdu', 'ur', 'Urdu'),
-    NllbLang('uzn_Latn', 'Northern Uzbek', 'uz', 'Uzbek'),
-    NllbLang('vec_Latn', 'Venetian'),
-    NllbLang('vie_Latn', 'Vietnamese', 'vi', 'Vietnamese'),
-    NllbLang('war_Latn', 'Waray'),
-    NllbLang('wol_Latn', 'Wolof'),
-    NllbLang('xho_Latn', 'Xhosa'),
-    NllbLang('ydd_Hebr', 'Eastern Yiddish', 'yi', 'Yiddish'),
-    NllbLang('yor_Latn', 'Yoruba', 'yo', 'Yoruba'),
-    NllbLang('yue_Hant', 'Yue Chinese', 'zh', 'Chinese'),
-    NllbLang('zho_Hans', 'Chinese (Simplified)', 'zh', 'Chinese'),
-    NllbLang('zho_Hant', 'Chinese (Traditional)', 'zh', 'Chinese'),
-    NllbLang('zsm_Latn', 'Standard Malay', 'ms', 'Malay'),
-    NllbLang('zul_Latn', 'Zulu'),
-]
-
-_TO_NLLB_LANG_CODE = {language.code.lower(): language for language in NLLB_LANGS if language.code is not None}
-
-_TO_NLLB_LANG_NAME = {language.name.lower(): language for language in NLLB_LANGS if language.name is not None}
-
-_TO_NLLB_LANG_WHISPER_CODE = {language.code_whisper.lower(): language for language in NLLB_LANGS if language.code_whisper is not None}
-
-_TO_NLLB_LANG_WHISPER_NAME = {language.name_whisper.lower(): language for language in NLLB_LANGS if language.name_whisper is not None}
-
-def get_nllb_lang_from_code(lang_code, default=None) -> NllbLang:
-    """Return the language from the language code."""
-    return _TO_NLLB_LANG_CODE.get(lang_code, default)
-
-def get_nllb_lang_from_name(lang_name, default=None) -> NllbLang:
-    """Return the language from the language name."""
-    return _TO_NLLB_LANG_NAME.get(lang_name.lower() if lang_name else None, default)
-
-def get_nllb_lang_from_code_whisper(lang_code_whisper, default=None) -> NllbLang:
-    """Return the language from the language code."""
-    return _TO_NLLB_LANG_WHISPER_CODE.get(lang_code_whisper, default)
-
-def get_nllb_lang_from_name_whisper(lang_name_whisper, default=None) -> NllbLang:
-    """Return the language from the language name."""
-    return _TO_NLLB_LANG_WHISPER_NAME.get(lang_name_whisper.lower() if lang_name_whisper else None, default)
-
-def get_nllb_lang_names():
-    """Return a list of language names."""
-    return [language.name for language in NLLB_LANGS]
-
-if __name__ == "__main__":
-    # Test lookup
-    print(get_nllb_lang_from_code('eng_Latn'))
-    print(get_nllb_lang_from_name('English'))
-
-    print(get_nllb_lang_names())

src/translation/translationLangs.py (new file)
@@ -0,0 +1,303 @@
+class Lang():
+    def __init__(self, code: str, *names: str):
+        self.code = code
+        self.names = names
+
+    def __repr__(self):
+        return f"code:{self.code}, name:{self.names}"
+
+class TranslationLang():
+    def __init__(self, nllb: Lang, whisper: Lang = None, m2m100: Lang = None):
+        self.nllb = nllb
+        self.whisper = whisper
+        self.m2m100 = None
+
+        if m2m100 is None: m2m100 = whisper
+        if m2m100 is not None and len(m2m100.names) > 0:
+            self.m2m100 = m2m100
+
+    def __repr__(self):
+        result = ""
+        if self.nllb is not None:
+            result += f"NLLB={self.nllb} "
+        if self.whisper is not None:
+            result += f"WHISPER={self.whisper} "
+        if self.m2m100 is not None:
+            result += f"M2M100={self.m2m100} "
+        return f"Language {result}"
+
+"""
+Model available Languages
+
+[NLLB]
+ace_Latn:Acehnese (Latin script), aka_Latn:Akan, als_Latn:Tosk Albanian, amh_Ethi:Amharic, asm_Beng:Assamese, awa_Deva:Awadhi, ayr_Latn:Central Aymara, azb_Arab:South Azerbaijani, azj_Latn:North Azerbaijani, bak_Cyrl:Bashkir, bam_Latn:Bambara, ban_Latn:Balinese, bel_Cyrl:Belarusian, bem_Latn:Bemba, ben_Beng:Bengali, bho_Deva:Bhojpuri, bjn_Latn:Banjar (Latin script), bod_Tibt:Standard Tibetan, bug_Latn:Buginese, ceb_Latn:Cebuano, cjk_Latn:Chokwe, ckb_Arab:Central Kurdish, crh_Latn:Crimean Tatar, cym_Latn:Welsh, dik_Latn:Southwestern Dinka, diq_Latn:Southern Zaza, dyu_Latn:Dyula, dzo_Tibt:Dzongkha, ewe_Latn:Ewe, fao_Latn:Faroese, fij_Latn:Fijian, fon_Latn:Fon, fur_Latn:Friulian, fuv_Latn:Nigerian Fulfulde, gaz_Latn:West Central Oromo, gla_Latn:Scottish Gaelic, gle_Latn:Irish, grn_Latn:Guarani, guj_Gujr:Gujarati, hat_Latn:Haitian Creole, hau_Latn:Hausa, hin_Deva:Hindi, hne_Deva:Chhattisgarhi, hye_Armn:Armenian, ibo_Latn:Igbo, ilo_Latn:Ilocano, ind_Latn:Indonesian, jav_Latn:Javanese, kab_Latn:Kabyle, kac_Latn:Jingpho, kam_Latn:Kamba, kan_Knda:Kannada, kas_Arab:Kashmiri (Arabic script), kas_Deva:Kashmiri (Devanagari script), kat_Geor:Georgian, kaz_Cyrl:Kazakh, kbp_Latn:Kabiyè, kea_Latn:Kabuverdianu, khk_Cyrl:Halh Mongolian, khm_Khmr:Khmer, kik_Latn:Kikuyu, kin_Latn:Kinyarwanda, kir_Cyrl:Kyrgyz, kmb_Latn:Kimbundu, kmr_Latn:Northern Kurdish, knc_Arab:Central Kanuri (Arabic script), knc_Latn:Central Kanuri (Latin script), kon_Latn:Kikongo, lao_Laoo:Lao, lij_Latn:Ligurian, lim_Latn:Limburgish, lin_Latn:Lingala, lmo_Latn:Lombard, ltg_Latn:Latgalian, ltz_Latn:Luxembourgish, lua_Latn:Luba-Kasai, lug_Latn:Ganda, luo_Latn:Luo, lus_Latn:Mizo, mag_Deva:Magahi, mai_Deva:Maithili, mal_Mlym:Malayalam, mar_Deva:Marathi, min_Latn:Minangkabau (Latin script), mlt_Latn:Maltese, mni_Beng:Meitei (Bengali script), mos_Latn:Mossi, mri_Latn:Maori, mya_Mymr:Burmese, npi_Deva:Nepali, nso_Latn:Northern Sotho, nus_Latn:Nuer, nya_Latn:Nyanja, ory_Orya:Odia, pag_Latn:Pangasinan, pan_Guru:Eastern Panjabi, pap_Latn:Papiamento, pbt_Arab:Southern Pashto, pes_Arab:Western Persian, plt_Latn:Plateau Malagasy, prs_Arab:Dari, quy_Latn:Ayacucho Quechua, run_Latn:Rundi, sag_Latn:Sango, san_Deva:Sanskrit, sat_Beng:Santali, scn_Latn:Sicilian, shn_Mymr:Shan, sin_Sinh:Sinhala, smo_Latn:Samoan, sna_Latn:Shona, snd_Arab:Sindhi, som_Latn:Somali, sot_Latn:Southern Sotho, srd_Latn:Sardinian, ssw_Latn:Swati, sun_Latn:Sundanese, swh_Latn:Swahili, szl_Latn:Silesian, tam_Taml:Tamil, taq_Latn:Tamasheq (Latin script), tat_Cyrl:Tatar, tel_Telu:Telugu, tgk_Cyrl:Tajik, tgl_Latn:Tagalog, tha_Thai:Thai, tir_Ethi:Tigrinya, tpi_Latn:Tok Pisin, tsn_Latn:Tswana, tso_Latn:Tsonga, tuk_Latn:Turkmen, tum_Latn:Tumbuka, tur_Latn:Turkish, twi_Latn:Twi, tzm_Tfng:Central Atlas Tamazight, uig_Arab:Uyghur, umb_Latn:Umbundu, urd_Arab:Urdu, uzn_Latn:Northern Uzbek, vec_Latn:Venetian, war_Latn:Waray, wol_Latn:Wolof, xho_Latn:Xhosa, ydd_Hebr:Eastern Yiddish, yor_Latn:Yoruba, zsm_Latn:Standard Malay, zul_Latn:Zulu
+https://github.com/facebookresearch/LASER/blob/main/nllb/README.md
+
+In the NLLB model, languages are identified by a FLORES-200 code of the form {language}_{script}, where the language is an ISO 639-3 code and the script is an ISO 15924 code.
+https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine
+
+[whisper]
+en:english, zh:chinese, de:german, es:spanish, ru:russian, ko:korean, fr:french, ja:japanese, pt:portuguese, tr:turkish, pl:polish, ca:catalan, nl:dutch, ar:arabic, sv:swedish, it:italian, id:indonesian, hi:hindi, fi:finnish, vi:vietnamese, he:hebrew, uk:ukrainian, el:greek, ms:malay, cs:czech, ro:romanian, da:danish, hu:hungarian, ta:tamil, no:norwegian, th:thai, ur:urdu, hr:croatian, bg:bulgarian, lt:lithuanian, la:latin, mi:maori, ml:malayalam, cy:welsh, sk:slovak, te:telugu, fa:persian, lv:latvian, bn:bengali, sr:serbian, az:azerbaijani, sl:slovenian, kn:kannada, et:estonian, mk:macedonian, br:breton, eu:basque, is:icelandic, hy:armenian, ne:nepali, mn:mongolian, bs:bosnian, kk:kazakh, sq:albanian, sw:swahili, gl:galician, mr:marathi, pa:punjabi, si:sinhala, km:khmer, sn:shona, yo:yoruba, so:somali, af:afrikaans, oc:occitan, ka:georgian, be:belarusian, tg:tajik, sd:sindhi, gu:gujarati, am:amharic, yi:yiddish, lo:lao, uz:uzbek, fo:faroese, ht:haitian creole, ps:pashto, tk:turkmen, nn:nynorsk, mt:maltese, sa:sanskrit, lb:luxembourgish, my:myanmar, bo:tibetan, tl:tagalog, mg:malagasy, as:assamese, tt:tatar, haw:hawaiian, ln:lingala, ha:hausa, ba:bashkir, jw:javanese, su:sundanese, yue:cantonese,
+https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+
+[m2m100]
+af:Afrikaans, am:Amharic, ar:Arabic, ast:Asturian, az:Azerbaijani, ba:Bashkir, be:Belarusian, bg:Bulgarian, bn:Bengali, br:Breton, bs:Bosnian, ca:Catalan; Valencian, ceb:Cebuano, cs:Czech, cy:Welsh, da:Danish, de:German, el:Greek, en:English, es:Spanish, et:Estonian, fa:Persian, ff:Fulah, fi:Finnish, fr:French, fy:Western Frisian, ga:Irish, gd:Gaelic; Scottish Gaelic, gl:Galician, gu:Gujarati, ha:Hausa, he:Hebrew, hi:Hindi, hr:Croatian, ht:Haitian; Haitian Creole, hu:Hungarian, hy:Armenian, id:Indonesian, ig:Igbo, ilo:Iloko, is:Icelandic, it:Italian, ja:Japanese, jv:Javanese, ka:Georgian, kk:Kazakh, km:Central Khmer, kn:Kannada, ko:Korean, lb:Luxembourgish; Letzeburgesch, lg:Ganda, ln:Lingala, lo:Lao, lt:Lithuanian, lv:Latvian, mg:Malagasy, mk:Macedonian, ml:Malayalam, mn:Mongolian, mr:Marathi, ms:Malay, my:Burmese, ne:Nepali, nl:Dutch; Flemish, no:Norwegian, ns:Northern Sotho, Occitan (oc:post 1500), or:Oriya, pa:Panjabi; Punjabi, pl:Polish, ps:Pushto; Pashto, pt:Portuguese, ro:Romanian; Moldavian; Moldovan, ru:Russian, sd:Sindhi, si:Sinhala; Sinhalese, sk:Slovak, sl:Slovenian, so:Somali, sq:Albanian, sr:Serbian, ss:Swati, su:Sundanese, sv:Swedish, sw:Swahili, ta:Tamil, th:Thai, tl:Tagalog, tn:Tswana, tr:Turkish, uk:Ukrainian, ur:Urdu, uz:Uzbek, vi:Vietnamese, wo:Wolof, xh:Xhosa, yi:Yiddish, yo:Yoruba, zh:Chinese, zu:Zulu
+https://huggingface.co/facebook/m2m100_1.2B
+
+The available languages for m2m100 and whisper are almost identical. Most of the codes correspond to the ISO 639-1 standard. For detailed information, please refer to the official documentation provided.
+"""
+TranslationLangs = [
+    TranslationLang(Lang("ace_Arab", "Acehnese (Arabic script)")),
+    TranslationLang(Lang("ace_Latn", "Acehnese (Latin script)")),
+    TranslationLang(Lang("acm_Arab", "Mesopotamian Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("acq_Arab", "Ta’izzi-Adeni Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("aeb_Arab", "Tunisian Arabic")),
+    TranslationLang(Lang("afr_Latn", "Afrikaans"), Lang("af", "Afrikaans")),
+    TranslationLang(Lang("ajp_Arab", "South Levantine Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("aka_Latn", "Akan")),
+    TranslationLang(Lang("amh_Ethi", "Amharic"), Lang("am", "Amharic")),
+    TranslationLang(Lang("apc_Arab", "North Levantine Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("arb_Arab", "Modern Standard Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("arb_Latn", "Modern Standard Arabic (Romanized)")),
+    TranslationLang(Lang("ars_Arab", "Najdi Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("ary_Arab", "Moroccan Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("arz_Arab", "Egyptian Arabic"), Lang("ar", "Arabic")),
+    TranslationLang(Lang("asm_Beng", "Assamese"), Lang("as", "Assamese")),
+    TranslationLang(Lang("ast_Latn", "Asturian"), None, Lang("ast", "Asturian")),
+    TranslationLang(Lang("awa_Deva", "Awadhi")),
+    TranslationLang(Lang("ayr_Latn", "Central Aymara")),
+    TranslationLang(Lang("azb_Arab", "South Azerbaijani"), Lang("az", "Azerbaijani")),
+    TranslationLang(Lang("azj_Latn", "North Azerbaijani"), Lang("az", "Azerbaijani")),
+    TranslationLang(Lang("bak_Cyrl", "Bashkir"), Lang("ba", "Bashkir")),
+    TranslationLang(Lang("bam_Latn", "Bambara")),
+    TranslationLang(Lang("ban_Latn", "Balinese")),
+    TranslationLang(Lang("bel_Cyrl", "Belarusian"), Lang("be", "Belarusian")),
+    TranslationLang(Lang("bem_Latn", "Bemba")),
+    TranslationLang(Lang("ben_Beng", "Bengali"), Lang("bn", "Bengali")),
+    TranslationLang(Lang("bho_Deva", "Bhojpuri")),
+    TranslationLang(Lang("bjn_Arab", "Banjar (Arabic script)")),
+    TranslationLang(Lang("bjn_Latn", "Banjar (Latin script)")),
+    TranslationLang(Lang("bod_Tibt", "Standard Tibetan"), Lang("bo", "Tibetan")),
+    TranslationLang(Lang("bos_Latn", "Bosnian"), Lang("bs", "Bosnian")),
+    TranslationLang(Lang("bug_Latn", "Buginese")),
+    TranslationLang(Lang("bul_Cyrl", "Bulgarian"), Lang("bg", "Bulgarian")),
+    TranslationLang(Lang("cat_Latn", "Catalan"), Lang("ca", "Catalan", "valencian")),
+    TranslationLang(Lang("ceb_Latn", "Cebuano"), None, Lang("ceb", "Cebuano")),
+    TranslationLang(Lang("ces_Latn", "Czech"), Lang("cs", "Czech")),
+    TranslationLang(Lang("cjk_Latn", "Chokwe")),
+    TranslationLang(Lang("ckb_Arab", "Central Kurdish")),
+    TranslationLang(Lang("crh_Latn", "Crimean Tatar")),
+    TranslationLang(Lang("cym_Latn", "Welsh"), Lang("cy", "Welsh")),
+    TranslationLang(Lang("dan_Latn", "Danish"), Lang("da", "Danish")),
+    TranslationLang(Lang("deu_Latn", "German"), Lang("de", "German")),
+    TranslationLang(Lang("dik_Latn", "Southwestern Dinka")),
+    TranslationLang(Lang("dyu_Latn", "Dyula")),
+    TranslationLang(Lang("dzo_Tibt", "Dzongkha")),
+    TranslationLang(Lang("ell_Grek", "Greek"), Lang("el", "Greek")),
+    TranslationLang(Lang("eng_Latn", "English"), Lang("en", "English")),
+    TranslationLang(Lang("epo_Latn", "Esperanto")),
+    TranslationLang(Lang("est_Latn", "Estonian"), Lang("et", "Estonian")),
+    TranslationLang(Lang("eus_Latn", "Basque"), Lang("eu", "Basque")),
+    TranslationLang(Lang("ewe_Latn", "Ewe")),
+    TranslationLang(Lang("fao_Latn", "Faroese"), Lang("fo", "Faroese")),
+    TranslationLang(Lang("fij_Latn", "Fijian")),
+    TranslationLang(Lang("fin_Latn", "Finnish"), Lang("fi", "Finnish")),
+    TranslationLang(Lang("fon_Latn", "Fon")),
+    TranslationLang(Lang("fra_Latn", "French"), Lang("fr", "French")),
+    TranslationLang(Lang("fur_Latn", "Friulian")),
+    TranslationLang(Lang("fuv_Latn", "Nigerian Fulfulde"), None, Lang("ff", "Fulah")),
+    TranslationLang(Lang("gla_Latn", "Scottish Gaelic"), None, Lang("gd", "Scottish Gaelic")),
+    TranslationLang(Lang("gle_Latn", "Irish"), None, Lang("ga", "Irish")),
+    TranslationLang(Lang("glg_Latn", "Galician"), Lang("gl", "Galician")),
+    TranslationLang(Lang("grn_Latn", "Guarani")),
+    TranslationLang(Lang("guj_Gujr", "Gujarati"), Lang("gu", "Gujarati")),
+    TranslationLang(Lang("hat_Latn", "Haitian Creole"), Lang("ht", "Haitian creole", "haitian")),
+    TranslationLang(Lang("hau_Latn", "Hausa"), Lang("ha", "Hausa")),
+    TranslationLang(Lang("heb_Hebr", "Hebrew"), Lang("he", "Hebrew")),
+    TranslationLang(Lang("hin_Deva", "Hindi"), Lang("hi", "Hindi")),
+    TranslationLang(Lang("hne_Deva", "Chhattisgarhi")),
+    TranslationLang(Lang("hrv_Latn", "Croatian"), Lang("hr", "Croatian")),
+    TranslationLang(Lang("hun_Latn", "Hungarian"), Lang("hu", "Hungarian")),
+    TranslationLang(Lang("hye_Armn", "Armenian"), Lang("hy", "Armenian")),
+    TranslationLang(Lang("ibo_Latn", "Igbo"), None, Lang("ig", "Igbo")),
+    TranslationLang(Lang("ilo_Latn", "Ilocano"), None, Lang("ilo", "Iloko")),
+    TranslationLang(Lang("ind_Latn", "Indonesian"), Lang("id", "Indonesian")),
+    TranslationLang(Lang("isl_Latn", "Icelandic"), Lang("is", "Icelandic")),
+    TranslationLang(Lang("ita_Latn", "Italian"), Lang("it", "Italian")),
+    TranslationLang(Lang("jav_Latn", "Javanese"), Lang("jw", "Javanese"), Lang("jv", "Javanese")),
+    TranslationLang(Lang("jpn_Jpan", "Japanese"), Lang("ja", "Japanese")),
+    TranslationLang(Lang("kab_Latn", "Kabyle")),
+    TranslationLang(Lang("kac_Latn", "Jingpho")),
+    TranslationLang(Lang("kam_Latn", "Kamba")),
+    TranslationLang(Lang("kan_Knda", "Kannada"), Lang("kn", "Kannada")),
+    TranslationLang(Lang("kas_Arab", "Kashmiri (Arabic script)")),
+    TranslationLang(Lang("kas_Deva", "Kashmiri (Devanagari script)")),
+    TranslationLang(Lang("kat_Geor", "Georgian"), Lang("ka", "Georgian")),
+    TranslationLang(Lang("knc_Arab", "Central Kanuri (Arabic script)")),
+    TranslationLang(Lang("knc_Latn", "Central Kanuri (Latin script)")),
+    TranslationLang(Lang("kaz_Cyrl", "Kazakh"), Lang("kk", "Kazakh")),
+    TranslationLang(Lang("kbp_Latn", "Kabiyè")),
+    TranslationLang(Lang("kea_Latn", "Kabuverdianu")),
+    TranslationLang(Lang("khm_Khmr", "Khmer"), Lang("km", "Khmer")),
+    TranslationLang(Lang("kik_Latn", "Kikuyu")),
+    TranslationLang(Lang("kin_Latn", "Kinyarwanda")),
+    TranslationLang(Lang("kir_Cyrl", "Kyrgyz")),
+    TranslationLang(Lang("kmb_Latn", "Kimbundu")),
+    TranslationLang(Lang("kmr_Latn", "Northern Kurdish")),
+    TranslationLang(Lang("kon_Latn", "Kikongo")),
+    TranslationLang(Lang("kor_Hang", "Korean"), Lang("ko", "Korean")),
+    TranslationLang(Lang("lao_Laoo", "Lao"), Lang("lo", "Lao")),
+    TranslationLang(Lang("lij_Latn", "Ligurian")),
+    TranslationLang(Lang("lim_Latn", "Limburgish")),
+    TranslationLang(Lang("lin_Latn", "Lingala"), Lang("ln", "Lingala")),
+    TranslationLang(Lang("lit_Latn", "Lithuanian"), Lang("lt", "Lithuanian")),
+    TranslationLang(Lang("lmo_Latn", "Lombard")),
+    TranslationLang(Lang("ltg_Latn", "Latgalian")),
+    TranslationLang(Lang("ltz_Latn", "Luxembourgish"), Lang("lb", "Luxembourgish", "letzeburgesch")),
+    TranslationLang(Lang("lua_Latn", "Luba-Kasai")),
+    TranslationLang(Lang("lug_Latn", "Ganda"), None, Lang("lg", "Ganda")),
+    TranslationLang(Lang("luo_Latn", "Luo")),
+    TranslationLang(Lang("lus_Latn", "Mizo")),
+    TranslationLang(Lang("lvs_Latn", "Standard Latvian"), Lang("lv", "Latvian")),
+    TranslationLang(Lang("mag_Deva", "Magahi")),
+    TranslationLang(Lang("mai_Deva", "Maithili")),
+    TranslationLang(Lang("mal_Mlym", "Malayalam"), Lang("ml", "Malayalam")),
+    TranslationLang(Lang("mar_Deva", "Marathi"), Lang("mr", "Marathi")),
+    TranslationLang(Lang("min_Arab", "Minangkabau (Arabic script)")),
+    TranslationLang(Lang("min_Latn", "Minangkabau (Latin script)")),
+    TranslationLang(Lang("mkd_Cyrl", "Macedonian"), Lang("mk", "Macedonian")),
+    TranslationLang(Lang("plt_Latn", "Plateau Malagasy"), Lang("mg", "Malagasy")),
+    TranslationLang(Lang("mlt_Latn", "Maltese"), Lang("mt", "Maltese")),
+    TranslationLang(Lang("mni_Beng", "Meitei (Bengali script)")),
+    TranslationLang(Lang("khk_Cyrl", "Halh Mongolian"), Lang("mn", "Mongolian")),
+    TranslationLang(Lang("mos_Latn", "Mossi")),
+    TranslationLang(Lang("mri_Latn", "Maori"), Lang("mi", "Maori")),
+    TranslationLang(Lang("mya_Mymr", "Burmese"), Lang("my", "Myanmar", "burmese")),
+    TranslationLang(Lang("nld_Latn", "Dutch"), Lang("nl", "Dutch", "flemish")),
+    TranslationLang(Lang("nno_Latn", "Norwegian Nynorsk"), Lang("nn", "Nynorsk")),
+    TranslationLang(Lang("nob_Latn", "Norwegian Bokmål"), Lang("no", "Norwegian")),
+    TranslationLang(Lang("npi_Deva", "Nepali"), Lang("ne", "Nepali")),
+    TranslationLang(Lang("nso_Latn", "Northern Sotho"), None, Lang("ns", "Northern Sotho")),
+    TranslationLang(Lang("nus_Latn", "Nuer")),
+    TranslationLang(Lang("nya_Latn", "Nyanja")),
+    TranslationLang(Lang("oci_Latn", "Occitan"), Lang("oc", "Occitan")),
+    TranslationLang(Lang("gaz_Latn", "West Central Oromo")),
+    TranslationLang(Lang("ory_Orya", "Odia"), None, Lang("or", "Oriya")),
+    TranslationLang(Lang("pag_Latn", "Pangasinan")),
+    TranslationLang(Lang("pan_Guru", "Eastern Panjabi"), Lang("pa", "Punjabi", "panjabi")),
+    TranslationLang(Lang("pap_Latn", "Papiamento")),
+    TranslationLang(Lang("pes_Arab", "Western Persian"), Lang("fa", "Persian")),
+    TranslationLang(Lang("pol_Latn", "Polish"), Lang("pl", "Polish")),
+    TranslationLang(Lang("por_Latn", "Portuguese"), Lang("pt", "Portuguese")),
+    TranslationLang(Lang("prs_Arab", "Dari")),
+    TranslationLang(Lang("pbt_Arab", "Southern Pashto"), Lang("ps", "Pashto", "pushto")),
+    TranslationLang(Lang("quy_Latn", "Ayacucho Quechua")),
+    TranslationLang(Lang("ron_Latn", "Romanian"), Lang("ro", "Romanian", "moldavian", "moldovan")),
+    TranslationLang(Lang("run_Latn", "Rundi")),
+    TranslationLang(Lang("rus_Cyrl", "Russian"), Lang("ru", "Russian")),
+    TranslationLang(Lang("sag_Latn", "Sango")),
+    TranslationLang(Lang("san_Deva", "Sanskrit"), Lang("sa", "Sanskrit")),
+    TranslationLang(Lang("sat_Olck", "Santali")),
+    TranslationLang(Lang("scn_Latn", "Sicilian")),
+    TranslationLang(Lang("shn_Mymr", "Shan")),
+    TranslationLang(Lang("sin_Sinh", "Sinhala"), Lang("si", "Sinhala", "sinhalese")),
+    TranslationLang(Lang("slk_Latn", "Slovak"), Lang("sk", "Slovak")),
+    TranslationLang(Lang("slv_Latn", "Slovenian"), Lang("sl", "Slovenian")),
+    TranslationLang(Lang("smo_Latn", "Samoan")),
+    TranslationLang(Lang("sna_Latn", "Shona"), Lang("sn", "Shona")),
+    TranslationLang(Lang("snd_Arab", "Sindhi"), Lang("sd", "Sindhi")),
+    TranslationLang(Lang("som_Latn", "Somali"), Lang("so", "Somali")),
+    TranslationLang(Lang("sot_Latn", "Southern Sotho")),
+    TranslationLang(Lang("spa_Latn", "Spanish"), Lang("es", "Spanish", "castilian")),
+    TranslationLang(Lang("als_Latn", "Tosk Albanian"), Lang("sq", "Albanian")),
+    TranslationLang(Lang("srd_Latn", "Sardinian")),
+    TranslationLang(Lang("srp_Cyrl", "Serbian"), Lang("sr", "Serbian")),
+    TranslationLang(Lang("ssw_Latn", "Swati"), None, Lang("ss", "Swati")),
+    TranslationLang(Lang("sun_Latn", "Sundanese"), Lang("su", "Sundanese")),
+    TranslationLang(Lang("swe_Latn", "Swedish"), Lang("sv", "Swedish")),
+    TranslationLang(Lang("swh_Latn", "Swahili"), Lang("sw", "Swahili")),
+    TranslationLang(Lang("szl_Latn", "Silesian")),
+    TranslationLang(Lang("tam_Taml", "Tamil"), Lang("ta", "Tamil")),
+    TranslationLang(Lang("tat_Cyrl", "Tatar"), Lang("tt", "Tatar")),
+    TranslationLang(Lang("tel_Telu", "Telugu"), Lang("te", "Telugu")),
+    TranslationLang(Lang("tgk_Cyrl", "Tajik"), Lang("tg", "Tajik")),
+    TranslationLang(Lang("tgl_Latn", "Tagalog"), Lang("tl", "Tagalog")),
+    TranslationLang(Lang("tha_Thai", "Thai"), Lang("th", "Thai")),
+    TranslationLang(Lang("tir_Ethi", "Tigrinya")),
+    TranslationLang(Lang("taq_Latn", "Tamasheq (Latin script)")),
+    TranslationLang(Lang("taq_Tfng", "Tamasheq (Tifinagh script)")),
+    TranslationLang(Lang("tpi_Latn", "Tok Pisin")),
+    TranslationLang(Lang("tsn_Latn", "Tswana"), None, Lang("tn", "Tswana")),
+    TranslationLang(Lang("tso_Latn", "Tsonga")),
+    TranslationLang(Lang("tuk_Latn", "Turkmen"), Lang("tk", "Turkmen")),
+    TranslationLang(Lang("tum_Latn", "Tumbuka")),
+    TranslationLang(Lang("tur_Latn", "Turkish"), Lang("tr", "Turkish")),
+    TranslationLang(Lang("twi_Latn", "Twi")),
+    TranslationLang(Lang("tzm_Tfng", "Central Atlas Tamazight")),
+    TranslationLang(Lang("uig_Arab", "Uyghur")),
+    TranslationLang(Lang("ukr_Cyrl", "Ukrainian"), Lang("uk", "Ukrainian")),
+    TranslationLang(Lang("umb_Latn", "Umbundu")),
+    TranslationLang(Lang("urd_Arab", "Urdu"), Lang("ur", "Urdu")),
+    TranslationLang(Lang("uzn_Latn", "Northern Uzbek"), Lang("uz", "Uzbek")),
+    TranslationLang(Lang("vec_Latn", "Venetian")),
+    TranslationLang(Lang("vie_Latn", "Vietnamese"), Lang("vi", "Vietnamese")),
+    TranslationLang(Lang("war_Latn", "Waray")),
+    TranslationLang(Lang("wol_Latn", "Wolof"), None, Lang("wo", "Wolof")),
+    TranslationLang(Lang("xho_Latn", "Xhosa"), None, Lang("xh", "Xhosa")),
+    TranslationLang(Lang("ydd_Hebr", "Eastern Yiddish"), Lang("yi", "Yiddish")),
+    TranslationLang(Lang("yor_Latn", "Yoruba"), Lang("yo", "Yoruba")),
+    TranslationLang(Lang("yue_Hant", "Yue Chinese"), Lang("yue", "cantonese"), Lang("zh", "Chinese (zh-yue)")),
+    TranslationLang(Lang("zho_Hans", "Chinese (Simplified)"), Lang("zh", "Chinese (Simplified)", "Chinese", "mandarin")),
+    TranslationLang(Lang("zho_Hant", "Chinese (Traditional)"), Lang("zh", "Chinese (Traditional)")),
+    TranslationLang(Lang("zsm_Latn", "Standard Malay"), Lang("ms", "Malay")),
+    TranslationLang(Lang("zul_Latn", "Zulu"), None, Lang("zu", "Zulu")),
+    TranslationLang(None, Lang("br", "Breton")), # Both whisper and m2m100 support the Breton language, but nllb does not have this language.
+]
+
+
+_TO_LANG_NAME_NLLB = {name.lower(): language for language in TranslationLangs if language.nllb is not None for name in language.nllb.names}
+
+_TO_LANG_NAME_M2M100 = {name.lower(): language for language in TranslationLangs if language.m2m100 is not None for name in language.m2m100.names}
+
+_TO_LANG_NAME_WHISPER = {name.lower(): language for language in TranslationLangs if language.whisper is not None for name in language.whisper.names}
+
+_TO_LANG_CODE_WHISPER = {language.whisper.code.lower(): language for language in TranslationLangs if language.whisper is not None and len(language.whisper.code) > 0}
+
+
+def get_lang_from_nllb_name(nllbName, default=None) -> TranslationLang:
+    """Return the TranslationLang from the lang_name_nllb."""
+    return _TO_LANG_NAME_NLLB.get(nllbName.lower() if nllbName else None, default)
+
+def get_lang_from_m2m100_name(m2m100Name, default=None) -> TranslationLang:
+    """Return the TranslationLang from the lang_name_m2m100 name."""
+    return _TO_LANG_NAME_M2M100.get(m2m100Name.lower() if m2m100Name else None, default)
+
+def get_lang_from_whisper_name(whisperName, default=None) -> TranslationLang:
+    """Return the TranslationLang from the lang_name_whisper name."""
+    return _TO_LANG_NAME_WHISPER.get(whisperName.lower() if whisperName else None, default)
+
+def get_lang_from_whisper_code(whisperCode, default=None) -> TranslationLang:
+    """Return the TranslationLang from the lang_code_whisper."""
+    return _TO_LANG_CODE_WHISPER.get(whisperCode, default)
+
+def get_lang_nllb_names():
+    """Return a list of nllb language names."""
+    return list(_TO_LANG_NAME_NLLB.keys())
+
+def get_lang_m2m100_names(codes = []):
+    """Return a list of m2m100 language names."""
+    return list({name.lower(): None for language in TranslationLangs if language.m2m100 is not None and (len(codes) == 0 or any(code in language.m2m100.code for code in codes)) for name in language.m2m100.names}.keys())
+
+def get_lang_whisper_names():
+    """Return a list of whisper language names."""
+    return list(_TO_LANG_NAME_WHISPER.keys())
+
+if __name__ == "__main__":
+    # Test lookup
+    print("name:Chinese (Traditional)", get_lang_from_nllb_name("Chinese (Traditional)"))
+    print("name:moldavian", get_lang_from_m2m100_name("moldavian"))
+    print("code:ja", get_lang_from_whisper_code("ja"))
+    print("name:English", get_lang_from_nllb_name('English'))
+
+    print(get_lang_m2m100_names(["en", "ja", "zh"]))
+    print(get_lang_nllb_names())
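
Note: the registry above is keyed three ways (NLLB name, M2M100 name, Whisper name/code). A minimal lookup sketch, assuming the module is imported from the path this commit introduces:

from src.translation.translationLangs import get_lang_from_whisper_code

lang = get_lang_from_whisper_code("ja")
if lang is not None:
    print(lang.nllb.code)    # "jpn_Jpan" - the FLORES-200 code NLLB expects
    print(lang.m2m100.code)  # "ja" - the ISO 639-1 style code M2M100 expects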

src/translation/translationModel.py (renamed from src/nllb/nllbModel.py; old-side lines truncated in this view are kept as-is)
@@ -9,24 +9,26 @@ import transformers
 
 from typing import Optional
 from src.config import ModelConfig
-from src.
-from src.nllb.nllbLangs import NllbLang, get_nllb_lang_from_code_whisper
+from src.translation.translationLangs import TranslationLang, get_lang_from_whisper_code
 
-class NllbModel:
+class TranslationModel:
     def __init__(
         self,
+        modelConfig: ModelConfig,
         device: str = None,
+        whisperLang: TranslationLang = None,
+        translationLang: TranslationLang = None,
+        batchSize: int = 2,
+        noRepeatNgramSize: int = 3,
+        numBeams: int = 2,
+        downloadRoot: Optional[str] = None,
+        localFilesOnly: bool = False,
+        loadModel: bool = False,
     ):
-        """Initializes the Nllb-200 model.
+        """Initializes the M2M100 / Nllb-200 / mt5 model.
 
         Args:
+          modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,
             1.3B, 3.3B...) or a path to a converted
             model directory. When a size is configured, the converted model is downloaded
             from the Hugging Face Hub.
@@ -44,62 +46,72 @@ class NllbModel:
           having multiple workers enables true parallelism when running the model
           (concurrent calls to self.model.generate() will run in parallel).
           This can improve the global throughput at the cost of increased memory usage.
+          downloadRoot: Directory where the models should be saved. If not set, the models
             are saved in the standard Hugging Face cache directory.
+          localFilesOnly: If True, avoid downloading the file and return the path to the
             local cached file if it exists.
         """
-        self.model_config = model_config
+        self.modelConfig = modelConfig
+        self.whisperLang = whisperLang # self.translationLangWhisper = get_lang_from_whisper_code(whisperLang.code.lower() if whisperLang is not None else "en")
+        self.translationLang = translationLang
 
+        if translationLang is None:
             return
+
+        self.batchSize = batchSize
+        self.noRepeatNgramSize = noRepeatNgramSize
+        self.numBeams = numBeams
 
-        if os.path.isdir(
+        if os.path.isdir(modelConfig.url):
+            self.modelPath = modelConfig.url
         else:
+            self.modelPath = download_model(
+                modelConfig,
+                localFilesOnly=localFilesOnly,
+                cacheDir=downloadRoot,
             )
 
         if device is None:
             if torch.cuda.is_available():
-                device = "cuda" if "ct2" in self.
+                device = "cuda" if "ct2" in self.modelPath else "cuda:0"
             else:
                 device = "cpu"
 
         self.device = device
 
+        if loadModel:
             self.load_model()
 
     def load_model(self):
-        print('\n\nLoading model: %s\n\n' % self.
-        if "ct2" in self.
+        print('\n\nLoading model: %s\n\n' % self.modelPath)
+        if "ct2" in self.modelPath:
+            if "nllb" in self.modelPath:
+                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.nllb.code)
+                self.targetPrefix = [self.translationLang.nllb.code]
+            elif "m2m100" in self.modelPath:
+                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.m2m100.code)
+                self.targetPrefix = [self.transTokenizer.lang_code_to_token[self.translationLang.m2m100.code]]
+            self.transModel = ctranslate2.Translator(self.modelPath, compute_type="auto", device=self.device)
+        elif "mt5" in self.modelPath:
+            self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
+            self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
+            self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath)
+            self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
+        else:
+            self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
+            self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
+            if "m2m100" in self.modelPath:
+                self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
+            else: #NLLB
+                self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
 
     def release_vram(self):
         try:
             if torch.cuda.is_available():
-                if "ct2" not in self.
+                if "ct2" not in self.modelPath:
                     device = torch.device("cpu")
-                    self.
-                    del self.
+                    self.transModel.to(device)
+                del self.transModel
                 torch.cuda.empty_cache()
             print("release vram end.")
         except Exception as e:
@@ -110,16 +122,16 @@ class NllbModel:
         output = None
         result = None
         try:
-            if "ct2" in self.
-                source = self.
-                output = self.
+            if "ct2" in self.modelPath:
+                source = self.transTokenizer.convert_ids_to_tokens(self.transTokenizer.encode(text))
+                output = self.transModel.translate_batch([source], target_prefix=[self.targetPrefix], max_batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, beam_size=self.numBeams)
                 target = output[0].hypotheses[0][1:]
-                result = self.
-            elif "mt5" in self.
-                output = self.
+                result = self.transTokenizer.decode(self.transTokenizer.convert_tokens_to_ids(target))
+            elif "mt5" in self.modelPath:
+                output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
                 result = output[0]['generated_text']
-            else: #NLLB
-                output = self.
+            else: #M2M100 & NLLB
+                output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
                 result = output[0]['translation_text']
         except Exception as e:
             print("Error translation text: " + str(e))
@@ -133,6 +145,8 @@ _MODELS = ["distilled-600M", "distilled-1.3B", "1.3B", "3.3B",
            "nllb-200-3.3B-ct2-float16", "nllb-200-1.3B-ct2", "nllb-200-1.3B-ct2-int8", "nllb-200-1.3B-ct2-float16",
            "nllb-200-distilled-1.3B-ct2", "nllb-200-distilled-1.3B-ct2-int8", "nllb-200-distilled-1.3B-ct2-float16",
            "nllb-200-distilled-600M-ct2", "nllb-200-distilled-600M-ct2-int8", "nllb-200-distilled-600M-ct2-float16",
+           "m2m100_1.2B-ct2", "m2m100_418M-ct2", "m2m100-12B-ct2",
+           "m2m100_1.2B", "m2m100_418M",
            "mt5-zh-ja-en-trimmed",
            "mt5-zh-ja-en-trimmed-fine-tuned-v1"]
 
@@ -140,10 +154,10 @@ def check_model_name(name):
     return any(allowed_name in name for allowed_name in _MODELS)
 
 def download_model(
+    modelConfig: ModelConfig,
+    outputDir: Optional[str] = None,
+    localFilesOnly: bool = False,
+    cacheDir: Optional[str] = None,
 ):
     """"download_model" is referenced from the "utils.py" script
     of the "faster_whisper" project, authored by guillaumekln.
@@ -153,13 +167,13 @@ def download_model(
     The model is downloaded from https://huggingface.co/facebook.
 
     Args:
+      modelConfig: config of the model to download (facebook/nllb-distilled-600M,
         facebook/nllb-distilled-1.3B, facebook/nllb-1.3B, facebook/nllb-3.3B...).
+      outputDir: Directory where the model should be saved. If not set, the model is saved in
         the cache directory.
+      localFilesOnly: If True, avoid downloading the file and return the path to the local
         cached file if it exists.
+      cacheDir: Path to the folder where cached files are stored.
 
     Returns:
         The path to the downloaded model.
@@ -167,19 +181,20 @@ def download_model(
     Raises:
         ValueError: if the model size is invalid.
     """
-    if not check_model_name(
+    if not check_model_name(modelConfig.name):
         raise ValueError(
-            "Invalid model name '%s', expected one of: %s" % (
+            "Invalid model name '%s', expected one of: %s" % (modelConfig.name, ", ".join(_MODELS))
         )
 
+    repoId = modelConfig.url #"facebook/nllb-200-%s" %
 
+    allowPatterns = [
         "config.json",
         "generation_config.json",
         "model.bin",
         "pytorch_model.bin",
         "pytorch_model.bin.index.json",
+        "pytorch_model-*.bin",
         "pytorch_model-00001-of-00003.bin",
         "pytorch_model-00002-of-00003.bin",
         "pytorch_model-00003-of-00003.bin",
@@ -190,30 +205,31 @@ def download_model(
         "shared_vocabulary.json",
         "special_tokens_map.json",
         "spiece.model",
+        "vocab.json", #m2m100
     ]
 
     kwargs = {
-        "local_files_only":
-        "allow_patterns":
+        "local_files_only": localFilesOnly,
+        "allow_patterns": allowPatterns,
         #"tqdm_class": disabled_tqdm,
     }
 
-    if
-        kwargs["local_dir"] =
+    if outputDir is not None:
+        kwargs["local_dir"] = outputDir
         kwargs["local_dir_use_symlinks"] = False
 
-    if
-        kwargs["cache_dir"] =
+    if cacheDir is not None:
+        kwargs["cache_dir"] = cacheDir
 
     try:
-        return huggingface_hub.snapshot_download(
+        return huggingface_hub.snapshot_download(repoId, **kwargs)
     except (
         huggingface_hub.utils.HfHubHTTPError,
         requests.exceptions.ConnectionError,
     ) as exception:
         warnings.warn(
             "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
+            repoId,
             exception,
         )
         warnings.warn(
@@ -221,4 +237,4 @@ def download_model(
     )
 
     kwargs["local_files_only"] = True
-    return huggingface_hub.snapshot_download(
+    return huggingface_hub.snapshot_download(repoId, **kwargs)
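
Note: a hedged sketch of how the renamed class is wired together; the exact ModelConfig constructor lives in src/config.py, and the keyword form below is an assumption for illustration only:

from src.config import ModelConfig
from src.translation.translationLangs import get_lang_from_whisper_code
from src.translation.translationModel import TranslationModel

model = TranslationModel(
    modelConfig=ModelConfig(name="m2m100_418M", url="facebook/m2m100_418M"),  # assumed kwargs
    whisperLang=get_lang_from_whisper_code("ja"),      # source language detected by Whisper
    translationLang=get_lang_from_whisper_code("en"),  # target language
    batchSize=2, noRepeatNgramSize=3, numBeams=2,      # the three new translation options
    loadModel=True,
)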

src/utils.py
@@ -100,46 +100,91 @@ def write_srt(transcript: Iterator[dict], file: TextIO,
             flush=True,
     )
 
+def write_srt_original(transcript: Iterator[dict], file: TextIO,
+                       maxLineWidth=None, highlight_words: bool = False, bilingual: bool = False):
+    """
+    Write a transcript to a file in SRT format.
+    Example usage:
+        from pathlib import Path
+        from whisper.utils import write_srt
+        result = transcribe(model, audio_path, temperature=temperature, **args)
+        # save SRT
+        audio_basename = Path(audio_path).stem
+        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
+            write_srt(result["segments"], file=srt)
+    """
+    iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+
+    for i, segment in enumerate(iterator, start=1):
+        if "original" not in segment:
+            continue
+
+        original = segment['original'].replace('-->', '->')
+
+        # write srt lines
+        print(
+            f"{i}\n"
+            f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
+            f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}",
+            file=file,
+            flush=True,
+        )
+
+        if original is not None: print(f"{original}",
+                                       file=file,
+                                       flush=True)
+
+        if bilingual:
+            text = segment['text'].replace('-->', '->')
+            print(f"{text}\n",
+                  file=file,
+                  flush=True)
+
 def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
     for segment in transcript:
         words: list = segment.get('words', [])
 
         # Append longest speaker ID if available
         segment_longest_speaker = segment.get('longest_speaker', None)
+
+        # Yield the segment as-is or processed
+        if len(words) == 0 and (maxLineWidth is None or maxLineWidth < 0) and segment_longest_speaker is None:
+            yield segment
+
         if segment_longest_speaker is not None:
             segment_longest_speaker = segment_longest_speaker.replace("SPEAKER", "S")
+
+        subtitle_start = segment['start']
+        subtitle_end = segment['end']
+        text = segment['text'].strip()
+        original_text = segment['original'].strip() if 'original' in segment else None
+
         if len(words) == 0:
-                'text': process_text(text, maxLineWidth)
-            }
+            # Prepend the longest speaker ID if available
+            if segment_longest_speaker is not None:
+                text = f"({segment_longest_speaker}) {text}"
+
+            result = {
+                'start': subtitle_start,
+                'end'  : subtitle_end,
+                'text' : process_text(text, maxLineWidth)
+            }
+            if original_text is not None and len(original_text) > 0:
+                result.update({'original': process_text(original_text, maxLineWidth)})
+            yield result
+
             # We are done
             continue
 
-        subtitle_start = segment['start']
-        subtitle_end = segment['end']
         if segment_longest_speaker is not None:
             # Add the beginning
             words.insert(0, {
                 'start': subtitle_start,
-                'end': subtitle_start,
-                'word': f"({segment_longest_speaker})"
+                'end'  : subtitle_start,
+                'word' : f"({segment_longest_speaker})"
             })
 
-        text_words = [ this_word["word"] for this_word in words ]
+        text_words = [text] if not highlight_words and original_text is not None and len(original_text) > 0 else [ this_word["word"] for this_word in words ]
         subtitle_text = __join_words(text_words, maxLineWidth)
 
         # Iterate over the words in the segment
@@ -154,15 +199,15 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                     # Display the text up to this point
                     yield {
                         'start': last,
-                        'end': start,
-                        'text': subtitle_text
+                        'end'  : start,
+                        'text' : subtitle_text
                     }
 
                     # Display the text with the current word highlighted
                     yield {
                         'start': start,
-                        'end': end,
-                        'text': __join_words(
+                        'end'  : end,
+                        'text' : __join_words(
                             [
                                 {
                                     "word": re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
@@ -180,17 +225,20 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                 # Display the last part of the text
                 yield {
                     'start': last,
-                    'end': subtitle_end,
-                    'text': subtitle_text
+                    'end'  : subtitle_end,
+                    'text' : subtitle_text
                 }
 
             # Just return the subtitle text
             else:
+                result = {
                     'start': subtitle_start,
-                    'end': subtitle_end,
-                    'text': subtitle_text
+                    'end'  : subtitle_end,
+                    'text' : subtitle_text
                 }
+                if original_text is not None and len(original_text) > 0:
+                    result.update({'original': original_text})
+                yield result
 
 def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
     if maxLineWidth is None or maxLineWidth < 0:
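
Note: a short sketch of the new *-original.srt / *-bilingual.srt output path, assuming segments carry the pre-translation text under the 'original' key (as the iterator above expects):

from src.utils import write_srt_original

segments = [{"start": 0.0, "end": 2.5, "text": "Hello there.", "original": "こんにちは。"}]
with open("audio-original.srt", "w", encoding="utf-8") as f:
    write_srt_original(segments, file=f, maxLineWidth=80)                  # original language only
with open("audio-bilingual.srt", "w", encoding="utf-8") as f:
    write_srt_original(segments, file=f, maxLineWidth=80, bilingual=True)  # original plus translation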

src/vad.py
@@ -242,9 +242,8 @@ class AbstractTranscription(ABC):
 
                 # Update prompt window
                 self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)
-
-                if detected_language is not None:
-                    result['language'] = detected_language
+
+                result['language'] = detected_language if detected_language is not None else segment_result['language']
         finally:
             # Notify progress listener that we are done
             if progressListener is not None:

src/whisper/abstractWhisperContainer.py
@@ -71,7 +71,7 @@ class AbstractWhisperContainer:
         pass
 
     @abc.abstractmethod
-    def create_callback(self,
+    def create_callback(self, languageCode: str = None, task: str = None,
                         prompt_strategy: AbstractPromptStrategy = None,
                         **decodeOptions: dict) -> AbstractWhisperCallback:
         """
@@ -79,8 +79,8 @@ class AbstractWhisperContainer:
 
         Parameters
         ----------
-            The target language of the transcription. If not specified, the language will be inferred from the audio content.
+        languageCode: str
+            The target language code of the transcription. If not specified, the language will be inferred from the audio content.
         task: str
             The task - either translate or transcribe.
         prompt_strategy: AbstractPromptStrategy

src/whisper/fasterWhisperContainer.py
@@ -4,7 +4,6 @@ from typing import List, Union
 from faster_whisper import WhisperModel, download_model
 from src.config import ModelConfig, VadInitialPromptMode
 from src.hooks.progressListener import ProgressListener
-from src.languages import get_language_from_name
 from src.modelCache import ModelCache
 from src.prompts.abstractPromptStrategy import AbstractPromptStrategy
 from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
@@ -57,7 +56,7 @@ class FasterWhisperContainer(AbstractWhisperContainer):
         model = WhisperModel(model_url, device=device, compute_type=self.compute_type)
         return model
 
-    def create_callback(self,
+    def create_callback(self, languageCode: str = None, task: str = None,
                         prompt_strategy: AbstractPromptStrategy = None,
                         **decodeOptions: dict) -> AbstractWhisperCallback:
         """
@@ -65,8 +64,8 @@ class FasterWhisperContainer(AbstractWhisperContainer):
 
         Parameters
         ----------
-            The target language of the transcription. If not specified, the language will be inferred from the audio content.
+        languageCode: str
+            The target language code of the transcription. If not specified, the language will be inferred from the audio content.
         task: str
             The task - either translate or transcribe.
         prompt_strategy: AbstractPromptStrategy
@@ -78,14 +77,14 @@ class FasterWhisperContainer(AbstractWhisperContainer):
         -------
         A WhisperCallback object.
         """
-        return FasterWhisperCallback(self,
+        return FasterWhisperCallback(self, languageCode=languageCode, task=task, prompt_strategy=prompt_strategy, **decodeOptions)
 
 class FasterWhisperCallback(AbstractWhisperCallback):
-    def __init__(self, model_container: FasterWhisperContainer,
+    def __init__(self, model_container: FasterWhisperContainer, languageCode: str = None, task: str = None,
                  prompt_strategy: AbstractPromptStrategy = None,
                  **decodeOptions: dict):
         self.model_container = model_container
+        self.languageCode = languageCode
         self.task = task
         self.prompt_strategy = prompt_strategy
         self.decodeOptions = decodeOptions
@@ -108,7 +107,6 @@ class FasterWhisperCallback(AbstractWhisperCallback):
             A callback to receive progress updates.
         """
         model: WhisperModel = self.model_container.get_model()
-        language_code = self._lookup_language_code(self.language) if self.language else None
 
         # Copy decode options and remove options that are not supported by faster-whisper
         decodeOptions = self.decodeOptions.copy()
@@ -139,7 +137,7 @@ class FasterWhisperCallback(AbstractWhisperCallback):
                          if self.prompt_strategy else prompt
 
         segments_generator, info = model.transcribe(audio, \
-            language=
+            language=self.languageCode if self.languageCode else detected_language, task=self.task, \
             initial_prompt=initial_prompt, \
             **decodeOptions
         )
@@ -197,11 +195,3 @@ class FasterWhisperCallback(AbstractWhisperCallback):
             return suppress_tokens
 
         return [int(token) for token in suppress_tokens.split(",")]
-
-    def _lookup_language_code(self, language: str):
-        language = get_language_from_name(language)
-
-        if language is None:
-            raise ValueError("Invalid language: " + language)
-
-        return language.code

src/whisper/whisperContainer.py
@@ -70,7 +70,7 @@ class WhisperContainer(AbstractWhisperContainer):
 
         return whisper.load_model(model_path, device=self.device, download_root=self.download_root)
 
-    def create_callback(self,
+    def create_callback(self, languageCode: str = None, task: str = None,
                         prompt_strategy: AbstractPromptStrategy = None,
                         **decodeOptions: dict) -> AbstractWhisperCallback:
         """
@@ -78,8 +78,8 @@ class WhisperContainer(AbstractWhisperContainer):
 
         Parameters
         ----------
-            The target language of the transcription. If not specified, the language will be inferred from the audio content.
+        languageCode: str
+            The target language code of the transcription. If not specified, the language will be inferred from the audio content.
         task: str
             The task - either translate or transcribe.
         prompt_strategy: AbstractPromptStrategy
@@ -91,7 +91,7 @@ class WhisperContainer(AbstractWhisperContainer):
         -------
         A WhisperCallback object.
         """
-        return WhisperCallback(self,
+        return WhisperCallback(self, languageCode=languageCode, task=task, prompt_strategy=prompt_strategy, **decodeOptions)
 
     def _get_model_path(self, model_config: ModelConfig, root_dir: str = None):
         from src.conversion.hf_converter import convert_hf_whisper
@@ -160,11 +160,11 @@ class WhisperContainer(AbstractWhisperContainer):
         return model_config.path
 
 class WhisperCallback(AbstractWhisperCallback):
-    def __init__(self, model_container: WhisperContainer,
+    def __init__(self, model_container: WhisperContainer, languageCode: str = None, task: str = None,
                  prompt_strategy: AbstractPromptStrategy = None,
                  **decodeOptions: dict):
         self.model_container = model_container
+        self.languageCode = languageCode
         self.task = task
         self.prompt_strategy = prompt_strategy
 
@@ -204,7 +204,7 @@ class WhisperCallback(AbstractWhisperCallback):
                          if self.prompt_strategy else prompt
 
         result = model.transcribe(audio, \
-            language=self.
+            language=self.languageCode if self.languageCode else detected_language, task=self.task, \
             initial_prompt=initial_prompt, \
             **decodeOptions
         )
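
Note: after this change both Whisper containers accept a language code rather than a language name, so the faster-whisper name lookup could be removed. A minimal call sketch (container construction elided):

callback = container.create_callback(languageCode="ja", task="transcribe")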