Translation model parameter "Using Bitsandbytes" has been added.
Using the Bitsandbytes package, float32 models can be dynamically converted into mixed 8-bit or 4-bit precision quantized models, which substantially reduces VRAM usage. The quantization parameter applies only to models that have not yet been quantized; it is not suitable for already-quantized formats such as CTranslate2, GPTQ, and GGUF.
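For context, the new option's "int8" and "int4" choices correspond to bitsandbytes quantization in transformers. The sketch below is illustrative only and is not code from this commit: the model id is a placeholder, and a CUDA GPU with the bitsandbytes package installed is assumed.

```python
# Illustrative sketch only (not code from this commit): how the "int8" / "int4"
# choices map onto bitsandbytes quantization in transformers. The model id is a
# placeholder; any not-yet-quantized float32 checkpoint works the same way.
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig

def load_quantized(model_id: str, using_bitsandbytes: str):
    if using_bitsandbytes == "int8":
        # Mixed 8-bit precision (LLM.int8()).
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
    elif using_bitsandbytes == "int4":
        # 4-bit NF4 quantization with nested quantization, computing in bfloat16.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    else:
        quant_config = None  # load in full precision
    return AutoModelForSeq2SeqLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        low_cpu_mem_usage=True,
    )

model = load_quantized("facebook/nllb-200-distilled-600M", "int8")
```

Because the weights are quantized on the fly while a float32 checkpoint is loaded, the option has no effect on checkpoints that already ship quantized (Ctranslate2, GPTQ, GGUF).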
- app.py +8 -4
- config.json5 +71 -68
- docs/options.md +5 -1
- docs/translateModel.md +15 -11
- requirements-fasterWhisper.txt +4 -1
- requirements-whisper.txt +4 -1
- requirements.txt +4 -1
- src/config.py +2 -0
- src/translation/translationModel.py +101 -43
app.py
CHANGED
@@ -240,7 +240,8 @@ class WhisperTranscriber:
         translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
         translationNumBeams: int = decodeOptions.pop("translationNumBeams")
         translationTorchDtypeFloat16: bool = decodeOptions.pop("translationTorchDtypeFloat16")
-
+        translationUsingBitsandbytes: str = decodeOptions.pop("translationUsingBitsandbytes")
+
         sourceInput: str = decodeOptions.pop("sourceInput")
         urlData: str = decodeOptions.pop("urlData")
         multipleFiles: List = decodeOptions.pop("multipleFiles")

@@ -377,7 +378,7 @@ class WhisperTranscriber:
                 translationLang = get_lang_from_m2m100_name(madlad400LangName)

             if translationLang is not None:
-                translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16)
+                translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16, usingBitsandbytes=translationUsingBitsandbytes)

             progress(0, desc="init transcribe")
             # Result

@@ -937,7 +938,9 @@ def create_ui(app_config: ApplicationConfig):
     mt5_models = app_config.get_model_names("mt5")
     ALMA_models = app_config.get_model_names("ALMA")
     madlad400_models = app_config.get_model_names("madlad400")
-    if not torch.cuda.is_available(): #
+    if not torch.cuda.is_available(): # Loading only quantized or models with medium-low parameters in an environment without GPU support.
+        nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
+        m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
         ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
         madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))

@@ -970,7 +973,8 @@ def create_ui(app_config: ApplicationConfig):
             gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
             gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
             gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams"),
-            gr.Checkbox(label="Translation - Torch Dtype float16", info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized
+            gr.Checkbox(label="Translation - Torch Dtype float16", visible=torch.cuda.is_available(), value=app_config.translation_torch_dtype_float16, info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)", elem_id="translationTorchDtypeFloat16"),
+            gr.Radio(label="Translation - Using Bitsandbytes", visible=torch.cuda.is_available(), choices=[None, "int8", "int4"], value=app_config.translation_using_bitsandbytes, info="Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)", elem_id="translationUsingBitsandbytes"),
         }

         common_vad_inputs = lambda : {
config.json5
CHANGED
@@ -59,30 +59,41 @@
       "type": "huggingface",
       "tokenizer_url": "facebook/m2m100_1.2B"
     },
+    {
+      "name": "m2m100_1.2B/facebook",
+      "url": "facebook/m2m100_1.2B",
+      "type": "huggingface"
+    },
     {
       "name": "m2m100_418M-ct2fast/michaelfeil",
       "url": "michaelfeil/ct2fast-m2m100_418M",
       "type": "huggingface",
       "tokenizer_url": "facebook/m2m100_418M"
     },
-    //{
-    //  "name": "m2m100-12B-ct2fast/michaelfeil",
-    //  "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
-    //},
     {
-      "name": "
-      "url": "facebook/
+      "name": "m2m100_418M/facebook",
+      "url": "facebook/m2m100_418M",
       "type": "huggingface"
     },
     {
-      "name": "
-      "url": "facebook/
+      "name": "m2m100-12B-last-ckpt/facebook",
+      "url": "facebook/m2m100-12B-last-ckpt",
       "type": "huggingface"
+    },
+    {
+      "name": "m2m100-12B-ct2fast/michaelfeil",
+      "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
     }
   ],
   "nllb": [
+    {
+      "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+    },
     {
       "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
       "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",

@@ -90,10 +101,22 @@
       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
     },
     {
-      "name": "nllb-200-
-      "url": "
+      "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+    },
+    {
+      "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+    },
+    {
+      "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
+      "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/nllb-200-1.3B"
     },
     {
       "name": "nllb-200-1.3B-ct2:float16/JustFrederik",

@@ -102,22 +125,37 @@
       "tokenizer_url": "facebook/nllb-200-1.3B"
     },
     {
-      "name": "nllb-200-
-      "url": "JustFrederik/nllb-200-
+      "name": "nllb-200-1.3B-ct2/JustFrederik",
+      "url": "JustFrederik/nllb-200-1.3B-ct2",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-1.3B"
     },
     {
-      "name": "nllb-200-1.3B
-      "url": "
+      "name": "nllb-200-distilled-1.3B/facebook",
+      "url": "facebook/nllb-200-distilled-1.3B",
+      "type": "huggingface"
+    },
+    {
+      "name": "nllb-200-1.3B/facebook",
+      "url": "facebook/nllb-200-1.3B",
+      "type": "huggingface"
+    },
+    {
+      "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
+      "url": "michaelfeil/ct2fast-nllb-200-3.3B",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-3.3B"
     },
     {
-      "name": "nllb-200-
-      "url": "JustFrederik/nllb-200-
+      "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
+      "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-3.3B"
+    },
+    {
+      "name": "nllb-200-3.3B/facebook",
+      "url": "facebook/nllb-200-3.3B",
+      "type": "huggingface"
     },
     {
       "name": "nllb-200-distilled-600M/facebook",

@@ -125,8 +163,8 @@
       "type": "huggingface"
     },
     {
-      "name": "nllb-200-distilled-600M-ct2/JustFrederik",
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2",
+      "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
       "type": "huggingface",
       "tokenizer_url": "facebook/nllb-200-distilled-600M"
     },

@@ -137,48 +175,11 @@
       "tokenizer_url": "facebook/nllb-200-distilled-600M"
     },
     {
-      "name": "nllb-200-distilled-600M-ct2
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2
+      "name": "nllb-200-distilled-600M-ct2/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-600M-ct2",
       "type": "huggingface",
       "tokenizer_url": "facebook/nllb-200-distilled-600M"
     }
-    // Uncomment to add official Facebook 1.3B and 3.3B model
-    // The official Facebook 1.3B and 3.3B model files are too large,
-    // and to avoid occupying too much disk space on Hugging Face's free spaces,
-    // these models are not included in the config.
-    //{
-    //  "name": "nllb-200-distilled-1.3B/facebook",
-    //  "url": "facebook/nllb-200-distilled-1.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-1.3B/facebook",
-    //  "url": "facebook/nllb-200-1.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-3.3B/facebook",
-    //  "url": "facebook/nllb-200-3.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
-    //},
-    //{
-    //  "name": "nllb-200-1.3B-ct2/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-1.3B-ct2",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/nllb-200-1.3B"
-    //},
-    //{
-    //  "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/nllb-200-3.3B"
-    //},
   ],
   "mt5": [
     {

@@ -238,7 +239,7 @@
       "name": "ALMA-13B/haoranxu",
       "url": "haoranxu/ALMA-13B",
       "type": "huggingface"
-    }
+    }
   ],
   "madlad400": [
     {

@@ -256,18 +257,18 @@
     {
       "name": "madlad400-3b-mt/jbochi",
       "url": "jbochi/madlad400-3b-mt",
-      "type": "huggingface"
+      "type": "huggingface"
     },
     {
       "name": "madlad400-7b-mt-bt/jbochi",
       "url": "jbochi/madlad400-7b-mt-bt",
-      "type": "huggingface"
+      "type": "huggingface"
     },
     {
       "name": "madlad400-10b-mt/jbochi",
       "url": "jbochi/madlad400-10b-mt",
-      "type": "huggingface"
-    }
+      "type": "huggingface"
+    }
   ]
 },
 // Configuration options that will be used if they are not specified in the command line arguments.

@@ -400,4 +401,6 @@
   "translation_num_beams": 2,
   // Translation - Torch Dtype float16, Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
   "translation_torch_dtype_float16": true,
+  // Translation - Using Bitsandbytes, Load the float32 translation model into mixed-8bit or 4bit precision quantized model(not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
+  "translation_using_bitsandbytes": null
 }
docs/options.md
CHANGED
@@ -204,4 +204,8 @@ Beam size (1 for greedy search).

 ## Translation - Torch Dtype float16
 - transformers: torch_dtype=torch.float16
-Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized
+Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
+
+## Translation - Using Bitsandbytes
+- transformers: load_in_8bit, load_in_4bit
+Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
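The load_in_8bit / load_in_4bit flags named above are the ones this commit forwards to from_pretrained in src/translation/translationModel.py. A condensed sketch of that pattern follows; the model id is only a placeholder, and a CUDA GPU with bitsandbytes installed is assumed.

```python
# Condensed illustration of the pattern used by this commit: build a kwargs dict
# and forward the bitsandbytes flags directly to from_pretrained.
# The model id below is a placeholder, not part of the commit.
import torch
import transformers

kwargsModel = {"pretrained_model_name_or_path": "facebook/nllb-200-distilled-600M",
               "low_cpu_mem_usage": True}

usingBitsandbytes = "int4"  # None, "int8" or "int4", as exposed by the new UI option
if usingBitsandbytes == "int8":
    kwargsModel.update({"load_in_8bit": True, "llm_int8_enable_fp32_cpu_offload": True})
elif usingBitsandbytes == "int4":
    kwargsModel.update({"load_in_4bit": True, "llm_int8_enable_fp32_cpu_offload": True,
                        "bnb_4bit_use_double_quant": True,
                        "bnb_4bit_quant_type": "nf4",
                        "bnb_4bit_compute_dtype": torch.bfloat16})

transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(**kwargsModel)
```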
docs/translateModel.md
CHANGED
@@ -22,7 +22,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October
 |------|------------|------|---------------|---------------|
 | [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 418M | 1.94 GB | float32 | ≈2 GB |
 | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
-| [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 |
+| [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | 22.1 GB (torch dtype in float16) |

 ## M2M100-CTranslate2

@@ -133,18 +133,18 @@ madlad400 is a multilingual machine translation model based on the T5 architecture

 ## SeamlessM4T

-SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.

-It enables multiple tasks without relying on separate models:
+It enables multiple tasks without relying on separate models:

-Speech-to-speech translation (S2ST)
-Speech-to-text translation (S2TT)
-Text-to-speech translation (T2ST)
-Text-to-text translation (T2TT)
-Automatic speech recognition (ASR)
+Speech-to-speech translation (S2ST)
+Speech-to-text translation (S2TT)
+Text-to-speech translation (T2ST)
+Text-to-text translation (T2TT)
+Automatic speech recognition (ASR)

-SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`"([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
-SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`"([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))
+SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`"([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
+SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`"([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|

@@ -175,4 +175,8 @@ Beam size (1 for greedy search).

 ## Translation - Torch Dtype float16
 - transformers: torch_dtype=torch.float16
-Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized
+Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
+
+## Translation - Using Bitsandbytes
+- transformers: load_in_8bit, load_in_4bit
+Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
requirements-fasterWhisper.txt
CHANGED
@@ -22,4 +22,7 @@ accelerate
 auto-gptq
 optimum
 # Needed by ALMA-GGUL
-ctransformers[cuda]
+ctransformers[cuda]
+# Needed by load_in_4bit parameters in transformers
+bitsandbytes==0.41.2; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
requirements-whisper.txt
CHANGED
@@ -21,4 +21,7 @@ accelerate
 auto-gptq
 optimum
 # Needed by ALMA-GGUL
-ctransformers[cuda]
+ctransformers[cuda]
+# Needed by load_in_4bit parameters in transformers
+bitsandbytes==0.41.2; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
requirements.txt
CHANGED
@@ -22,4 +22,7 @@ accelerate
 auto-gptq
 optimum
 # Needed by ALMA-GGUL
-ctransformers[cuda]
+ctransformers[cuda]
+# Needed by load_in_4bit parameters in transformers
+bitsandbytes==0.41.2; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
src/config.py
CHANGED
@@ -83,6 +83,7 @@ class ApplicationConfig:
                  translation_no_repeat_ngram_size: int = 3,
                  translation_num_beams: int = 2,
                  translation_torch_dtype_float16: bool = True,
+                 translation_using_bitsandbytes: str = None,
                  # Whisper Segments Filter
                  whisper_segments_filter: bool = False,
                  whisper_segments_filters: List[str] = [],

@@ -152,6 +153,7 @@ class ApplicationConfig:
         self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
         self.translation_num_beams = translation_num_beams
         self.translation_torch_dtype_float16 = translation_torch_dtype_float16
+        self.translation_using_bitsandbytes = translation_using_bitsandbytes
         # Whisper Segments Filter
         self.whisper_segments_filter = whisper_segments_filter
         self.whisper_segments_filters = whisper_segments_filters
src/translation/translationModel.py
CHANGED
@@ -22,6 +22,7 @@ class TranslationModel:
                  noRepeatNgramSize: int = 3,
                  numBeams: int = 2,
                  torchDtypeFloat16: bool = True,
+                 usingBitsandbytes: str = None,
                  downloadRoot: Optional[str] = None,
                  localFilesOnly: bool = False,
                  loadModel: bool = False,

@@ -73,7 +74,14 @@ class TranslationModel:
         )

         if device is None:
+            self.totalVram = 0
             if torch.cuda.is_available():
+                try:
+                    deviceId = torch.cuda.current_device()
+                    self.totalVram = torch.cuda.get_device_properties(deviceId).total_memory/(1024*1024*1024)
+                except Exception as e:
+                    print(traceback.format_exc())
+                    print("Error detect vram: " + str(e))
                 device = "cuda" if "ct2" in self.modelPath else "cuda:0"
             else:
                 device = "cpu"

@@ -81,12 +89,30 @@ class TranslationModel:

         self.device = device
         self.torchDtypeFloat16 = torchDtypeFloat16
+        self.usingBitsandbytes = usingBitsandbytes

         if loadModel:
             self.load_model()

     def load_model(self):
         """
+        [transformers.BitsAndBytesConfig]
+        load_in_8bit (bool, optional, defaults to False)
+            This flag is used to enable 8-bit quantization with LLM.int8().
+        load_in_4bit (bool, optional, defaults to False)
+            This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from bitsandbytes.
+        llm_int8_enable_fp32_cpu_offload (bool, optional, defaults to False)
+            This flag is used for advanced use cases and users that are aware of this feature.
+            If you want to split your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use this flag.
+            This is useful for offloading large models such as google/flan-t5-xxl. Note that the int8 operations will not be run on CPU.
+        bnb_4bit_compute_dtype (torch.dtype or str, optional, defaults to torch.float32)
+            This sets the computational type which might be different than the input time.
+            For example, inputs might be fp32, but computation can be set to bf16 for speedups.
+        bnb_4bit_quant_type (str, optional, defaults to "fp4")
+            This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types which are specified by fp4 or nf4.
+        bnb_4bit_use_double_quant (bool, optional, defaults to False)
+            This flag is used for nested quantization where the quantization constants from the first quantization are quantized again.
+
         [from_pretrained]
         low_cpu_mem_usage(bool, optional):
             Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.

@@ -172,64 +198,96 @@ class TranslationModel:
         """
         try:
             print('\n\nLoading model: %s\n\n' % self.modelPath)
+            kwargsTokenizer = {}
+            kwargsModel = {}
+            kwargsPipeline = {}
+
+            if not any(name in self.modelPath for name in ["ct2", "GGUF", "GPTQ"]):
+                kwargsModel["torch_dtype"] = torch.float16 if self.torchDtypeFloat16 else "auto"
+
+            if "GPTQ" in self.modelPath:
+                kwargsModel.update({"device_map": "auto"})
+            elif "ct2" in self.modelPath:
+                kwargsModel.update({"device": self.device})
+            elif "GGUF" in self.modelPath:
+                pass
+            elif self.usingBitsandbytes == None:
+                kwargsPipeline.update({"device": self.device})
+            elif self.usingBitsandbytes == "int8":
+                kwargsModel.update({"load_in_8bit": True, "llm_int8_enable_fp32_cpu_offload": True})
+            elif self.usingBitsandbytes == "int4":
+                kwargsModel.update({"load_in_4bit": True, "llm_int8_enable_fp32_cpu_offload": True,
+                                    "bnb_4bit_use_double_quant": True,
+                                    "bnb_4bit_quant_type": "nf4",
+                                    "bnb_4bit_compute_dtype": torch.bfloat16})
+
+            if not any(name in self.modelPath for name in ["ct2", "GGUF"]):
+                kwargsModel.update({"pretrained_model_name_or_path": self.modelPath, "low_cpu_mem_usage": True})
+
             if "ct2" in self.modelPath:
-
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath})
+                kwargsModel.update({"model_path": self.modelPath, "compute_type": "auto"})
+                if "ALMA" in self.modelPath:
+                    self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
+                    self.transModel = ctranslate2.Generator(**kwargsModel)
+                else:
                 if "nllb" in self.modelPath:
-
+                        kwargsTokenizer.update({"src_lang": self.whisperLang.nllb.code})
                     self.targetPrefix = [self.translationLang.nllb.code]
                 elif "m2m100" in self.modelPath:
-
-
-
-
-                    self.
-
-
-
-                    self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
-                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.m2m100.code)
-                self.transModel = ctranslate2.Translator(self.modelPath, compute_type="auto", device=self.device)
+                        kwargsTokenizer.update({"src_lang": self.whisperLang.m2m100.code})
+                    elif "madlad400" in self.modelPath:
+                        kwargsTokenizer.update({"src_lang": self.whisperLang.m2m100.code})
+                        self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
+                    self.transModel = ctranslate2.Translator(**kwargsModel)
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                    if "m2m100" in self.modelPath:
+                        self.targetPrefix = [self.transTokenizer.lang_code_to_token[self.translationLang.m2m100.code]]
             elif "mt5" in self.modelPath:
                 self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
-
-                self.
-                self.
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "legacy": False})
+                self.transTokenizer = transformers.T5Tokenizer.from_pretrained(**kwargsTokenizer)
+                self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(**kwargsModel)
+                kwargsPipeline.update({"task": "text2text-generation", "model": self.transModel, "tokenizer": self.transTokenizer})
             elif "ALMA" in self.modelPath:
                 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-                if "
-
-
-
-
-                        transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
-                        transModelConfig.quantization_config["use_exllama"] = False
-                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
-                    else:
-                        # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
-                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
-                elif "GGUF" in self.modelPath:
+                if "GGUF" in self.modelPath:
+                    kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelConfig.tokenizer_url})
+                    kwargsModel.update({"model_path_or_repo_id": self.modelPath, "hf": True, "model_file": self.modelConfig.model_file, "model_type": "llama"})
+                    if self.totalVram > 2:
+                        kwargsModel.update({"gpu_layers":int(self.totalVram*7)})
                     import ctransformers
-                    self.
-                    if self.device == "cpu":
-                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, low_cpu_mem_usage=True)
-                    else:
-                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50, low_cpu_mem_usage=True)
+                    self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(**kwargsModel)
                 else:
-
-
-
+                    kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "use_fast": True})
+                    if "GPTQ" in self.modelPath:
+                        kwargsModel.update({"trust_remote_code": False, "revision": self.modelConfig.revision})
+                        if self.device == "cpu":
+                            # Due to the poor support of GPTQ for CPUs, Therefore, it is strongly discouraged to operate it on CPU.
+                            # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
+                            transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
+                            transModelConfig.quantization_config["use_exllama"] = False
+                            kwargsModel.update({"config": transModelConfig})
+                    self.transModel = transformers.AutoModelForCausalLM.from_pretrained(**kwargsModel)
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                kwargsPipeline.update({"task": "text-generation", "model": self.transModel, "tokenizer": self.transTokenizer, "do_sample": True, "temperature": 0.7, "top_k": 40, "top_p": 0.95, "repetition_penalty": 1.1})
             elif "madlad400" in self.modelPath:
                 self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
-
-                self.
-                self.
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "legacy": False})
+                self.transTokenizer = transformers.T5Tokenizer.from_pretrained(**kwargsTokenizer)
+                self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(**kwargsModel)
+                kwargsPipeline.update({"task": "text2text-generation", "model": self.transModel, "tokenizer": self.transTokenizer})
             else:
-
-                self.
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath})
+                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(**kwargsModel)
+                kwargsPipeline.update({"task": "translation", "model": self.transModel, "tokenizer": self.transTokenizer})
                 if "m2m100" in self.modelPath:
-
+                    kwargsPipeline.update({"src_lang": self.whisperLang.m2m100.code, "tgt_lang": self.translationLang.m2m100.code})
                 else: #NLLB
-
+                    kwargsPipeline.update({"src_lang": self.whisperLang.nllb.code, "tgt_lang": self.translationLang.nllb.code})
+            if "ct2" not in self.modelPath:
+                self.transTranslator = transformers.pipeline(**kwargsPipeline)
         except Exception as e:
             self.release_vram()
             raise e