avans06 committed on
Commit
85d6c89
1 Parent(s): d0c7a01

Translation model parameter "Using Bitsandbytes" has been added.

Utilizing the Bitsandbytes package, float32 models can be dynamically converted into mixed 8-bit or 4-bit precision quantized models, effectively reducing VRAM usage. This quantization parameter applies only to models that have not yet been quantized; it is not suitable for already-quantized models such as Ctranslate2, GPTQ, and GGUF.
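As a rough, non-authoritative sketch of what the new option maps to, these are the bitsandbytes-backed kwargs that the commit forwards to transformers' from_pretrained for the "int8" and "int4" choices; the checkpoint name is only an example and a CUDA-capable GPU is assumed:

import torch
import transformers

model_name = "facebook/nllb-200-distilled-600M"  # example of a not-yet-quantized float32 checkpoint

# "int8": mixed 8-bit quantization (LLM.int8()) with fp32 CPU offload enabled
model_int8 = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    model_name, load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True, low_cpu_mem_usage=True)

# "int4": 4-bit NF4 quantization with nested quantization and bfloat16 compute
model_int4 = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    model_name, load_in_4bit=True, llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, low_cpu_mem_usage=True)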

app.py CHANGED
@@ -240,7 +240,8 @@ class WhisperTranscriber:
          translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
          translationNumBeams: int = decodeOptions.pop("translationNumBeams")
          translationTorchDtypeFloat16: bool = decodeOptions.pop("translationTorchDtypeFloat16")
-
+         translationUsingBitsandbytes: str = decodeOptions.pop("translationUsingBitsandbytes")
+
          sourceInput: str = decodeOptions.pop("sourceInput")
          urlData: str = decodeOptions.pop("urlData")
          multipleFiles: List = decodeOptions.pop("multipleFiles")
@@ -377,7 +378,7 @@ class WhisperTranscriber:
              translationLang = get_lang_from_m2m100_name(madlad400LangName)
  
          if translationLang is not None:
-             translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16)
+             translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16, usingBitsandbytes=translationUsingBitsandbytes)
  
          progress(0, desc="init transcribe")
          # Result
@@ -937,7 +938,9 @@ def create_ui(app_config: ApplicationConfig):
      mt5_models = app_config.get_model_names("mt5")
      ALMA_models = app_config.get_model_names("ALMA")
      madlad400_models = app_config.get_model_names("madlad400")
-     if not torch.cuda.is_available(): #Load only GGUF and CT2 translation models in pure CPU environments..
+     if not torch.cuda.is_available(): # Loading only quantized or models with medium-low parameters in an environment without GPU support.
+         nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
+         m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
          ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
          madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))
  
@@ -970,7 +973,8 @@ def create_ui(app_config: ApplicationConfig):
          gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
          gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
          gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams"),
-         gr.Checkbox(label="Translation - Torch Dtype float16", info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF)", value=app_config.translation_torch_dtype_float16, elem_id="translationTorchDtypeFloat16")
+         gr.Checkbox(label="Translation - Torch Dtype float16", visible=torch.cuda.is_available(), value=app_config.translation_torch_dtype_float16, info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)", elem_id="translationTorchDtypeFloat16"),
+         gr.Radio(label="Translation - Using Bitsandbytes", visible=torch.cuda.is_available(), choices=[None, "int8", "int4"], value=app_config.translation_using_bitsandbytes, info="Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)", elem_id="translationUsingBitsandbytes"),
      }
  
      common_vad_inputs = lambda : {
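For context, a hypothetical stand-alone sketch (not the app's actual create_ui code) of how the "Using Bitsandbytes" radio added above could be wired up; the describe callback and layout are invented for illustration, while the choices and visibility rule mirror the diff:

import gradio as gr
import torch

def describe(using_bitsandbytes):
    # The app pops this value out of decodeOptions as "translationUsingBitsandbytes".
    return f"translationUsingBitsandbytes = {using_bitsandbytes!r}"

with gr.Blocks() as demo:
    choice = gr.Radio(label="Translation - Using Bitsandbytes",
                      visible=torch.cuda.is_available(),
                      choices=[None, "int8", "int4"], value=None)
    out = gr.Textbox(label="Selected value")
    choice.change(describe, inputs=choice, outputs=out)

# demo.launch()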
config.json5 CHANGED
@@ -59,30 +59,41 @@
        "type": "huggingface",
        "tokenizer_url": "facebook/m2m100_1.2B"
      },
+     {
+       "name": "m2m100_1.2B/facebook",
+       "url": "facebook/m2m100_1.2B",
+       "type": "huggingface"
+     },
      {
        "name": "m2m100_418M-ct2fast/michaelfeil",
        "url": "michaelfeil/ct2fast-m2m100_418M",
        "type": "huggingface",
        "tokenizer_url": "facebook/m2m100_418M"
      },
-     //{
-     //  "name": "m2m100-12B-ct2fast/michaelfeil",
-     //  "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
-     //  "type": "huggingface",
-     //  "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
-     //},
      {
-       "name": "m2m100_1.2B/facebook",
-       "url": "facebook/m2m100_1.2B",
+       "name": "m2m100_418M/facebook",
+       "url": "facebook/m2m100_418M",
        "type": "huggingface"
      },
      {
-       "name": "m2m100_418M/facebook",
-       "url": "facebook/m2m100_418M",
+       "name": "m2m100-12B-last-ckpt/facebook",
+       "url": "facebook/m2m100-12B-last-ckpt",
        "type": "huggingface"
+     },
+     {
+       "name": "m2m100-12B-ct2fast/michaelfeil",
+       "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
+       "type": "huggingface",
+       "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
      }
    ],
    "nllb": [
+     {
+       "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
+       "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
+       "type": "huggingface",
+       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+     },
      {
        "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
        "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",
@@ -90,10 +101,22 @@
        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
      },
      {
-       "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
-       "url": "michaelfeil/ct2fast-nllb-200-3.3B",
+       "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
+       "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
        "type": "huggingface",
-       "tokenizer_url": "facebook/nllb-200-3.3B"
+       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+     },
+     {
+       "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
+       "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
+       "type": "huggingface",
+       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+     },
+     {
+       "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
+       "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
+       "type": "huggingface",
+       "tokenizer_url": "facebook/nllb-200-1.3B"
      },
      {
        "name": "nllb-200-1.3B-ct2:float16/JustFrederik",
@@ -102,22 +125,37 @@
        "tokenizer_url": "facebook/nllb-200-1.3B"
      },
      {
-       "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
-       "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
+       "name": "nllb-200-1.3B-ct2/JustFrederik",
+       "url": "JustFrederik/nllb-200-1.3B-ct2",
        "type": "huggingface",
-       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+       "tokenizer_url": "facebook/nllb-200-1.3B"
      },
      {
-       "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
-       "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
+       "name": "nllb-200-distilled-1.3B/facebook",
+       "url": "facebook/nllb-200-distilled-1.3B",
+       "type": "huggingface"
+     },
+     {
+       "name": "nllb-200-1.3B/facebook",
+       "url": "facebook/nllb-200-1.3B",
+       "type": "huggingface"
+     },
+     {
+       "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
+       "url": "michaelfeil/ct2fast-nllb-200-3.3B",
        "type": "huggingface",
-       "tokenizer_url": "facebook/nllb-200-1.3B"
+       "tokenizer_url": "facebook/nllb-200-3.3B"
      },
      {
-       "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
-       "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
+       "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
+       "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
        "type": "huggingface",
-       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+       "tokenizer_url": "facebook/nllb-200-3.3B"
+     },
+     {
+       "name": "nllb-200-3.3B/facebook",
+       "url": "facebook/nllb-200-3.3B",
+       "type": "huggingface"
      },
      {
        "name": "nllb-200-distilled-600M/facebook",
@@ -125,8 +163,8 @@
        "type": "huggingface"
      },
      {
-       "name": "nllb-200-distilled-600M-ct2/JustFrederik",
-       "url": "JustFrederik/nllb-200-distilled-600M-ct2",
+       "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
+       "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-600M"
      },
@@ -137,48 +175,11 @@
        "tokenizer_url": "facebook/nllb-200-distilled-600M"
      },
      {
-       "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
-       "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
+       "name": "nllb-200-distilled-600M-ct2/JustFrederik",
+       "url": "JustFrederik/nllb-200-distilled-600M-ct2",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-600M"
      }
-     // Uncomment to add official Facebook 1.3B and 3.3B model
-     // The official Facebook 1.3B and 3.3B model files are too large,
-     // and to avoid occupying too much disk space on Hugging Face's free spaces,
-     // these models are not included in the config.
-     //{
-     //  "name": "nllb-200-distilled-1.3B/facebook",
-     //  "url": "facebook/nllb-200-distilled-1.3B",
-     //  "type": "huggingface"
-     //},
-     //{
-     //  "name": "nllb-200-1.3B/facebook",
-     //  "url": "facebook/nllb-200-1.3B",
-     //  "type": "huggingface"
-     //},
-     //{
-     //  "name": "nllb-200-3.3B/facebook",
-     //  "url": "facebook/nllb-200-3.3B",
-     //  "type": "huggingface"
-     //},
-     //{
-     //  "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
-     //  "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
-     //  "type": "huggingface",
-     //  "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
-     //},
-     //{
-     //  "name": "nllb-200-1.3B-ct2/JustFrederik",
-     //  "url": "JustFrederik/nllb-200-1.3B-ct2",
-     //  "type": "huggingface",
-     //  "tokenizer_url": "facebook/nllb-200-1.3B"
-     //},
-     //{
-     //  "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
-     //  "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
-     //  "type": "huggingface",
-     //  "tokenizer_url": "facebook/nllb-200-3.3B"
-     //},
    ],
    "mt5": [
      {
@@ -238,7 +239,7 @@
        "name": "ALMA-13B/haoranxu",
        "url": "haoranxu/ALMA-13B",
        "type": "huggingface"
-     },
+     }
    ],
    "madlad400": [
      {
@@ -256,18 +257,18 @@
      {
        "name": "madlad400-3b-mt/jbochi",
        "url": "jbochi/madlad400-3b-mt",
-       "type": "huggingface",
+       "type": "huggingface"
      },
      {
        "name": "madlad400-7b-mt-bt/jbochi",
        "url": "jbochi/madlad400-7b-mt-bt",
-       "type": "huggingface",
+       "type": "huggingface"
      },
      {
        "name": "madlad400-10b-mt/jbochi",
        "url": "jbochi/madlad400-10b-mt",
-       "type": "huggingface",
-     },
+       "type": "huggingface"
+     }
    ]
  },
  // Configuration options that will be used if they are not specified in the command line arguments.
@@ -400,4 +401,6 @@
    "translation_num_beams": 2,
    // Translation - Torch Dtype float16, Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
    "translation_torch_dtype_float16": true,
+   // Translation - Using Bitsandbytes, Load the float32 translation model into mixed-8bit or 4bit precision quantized model (not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
+   "translation_using_bitsandbytes": null
  }
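A small sketch (not from the repository) of how the new default could be inspected; it assumes the json5 package is available and that the keys sit at the top level of config.json5, as the hunk above suggests:

import json5  # assumption: the json5 package is used to parse config.json5

with open("config.json5", "r", encoding="utf-8") as f:
    cfg = json5.load(f)

# New default added by this commit: null means "do not quantize with bitsandbytes".
print(cfg.get("translation_torch_dtype_float16"))  # True
print(cfg.get("translation_using_bitsandbytes"))    # None unless set to "int8" or "int4"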
docs/options.md CHANGED
@@ -204,4 +204,8 @@ Beam size (1 for greedy search).
  
  ## Translation - Torch Dtype float16
  - transformers: torch_dtype=torch.float16
- Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models such as Ctranslate2, GPTQ, GGUF)
+ Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
+ 
+ ## Translation - Using Bitsandbytes
+ - transformers: load_in_8bit, load_in_4bit
+ Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
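The two transformers flags named above can also be bundled into a BitsAndBytesConfig; a minimal sketch follows (the checkpoint name is only an example, and the app itself passes the individual kwargs rather than a config object):

import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # or load_in_8bit=True for the "int8" choice
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M",     # example float32 checkpoint
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)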
docs/translateModel.md CHANGED
@@ -22,7 +22,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October 2020
  |------|------------|------|---------------|---------------|
  | [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 418M | 1.94 GB | float32 | ≈2 GB |
  | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
- | [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | N/A |
+ | [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | 22.1 GB (torch dtype in float16) |
  
  ## M2M100-CTranslate2
  
@@ -133,18 +133,18 @@ madlad400 is a multilingual machine translation model based on the T5 architecture
  
  ## SeamlessM4T
  
- SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+ SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
  
- It enables multiple tasks without relying on separate models:
+ It enables multiple tasks without relying on separate models:
  
- Speech-to-speech translation (S2ST)
- Speech-to-text translation (S2TT)
- Text-to-speech translation (T2ST)
- Text-to-text translation (T2TT)
- Automatic speech recognition (ASR)
+ Speech-to-speech translation (S2ST)
+ Speech-to-text translation (S2TT)
+ Text-to-speech translation (T2ST)
+ Text-to-text translation (T2TT)
+ Automatic speech recognition (ASR)
  
- SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`"([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
- SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`"([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))
+ SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`"([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
+ SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`"([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))
  
  | Name | Parameters | Size | type/quantize | Required VRAM |
  |------|------------|------|---------------|---------------|
@@ -175,4 +175,8 @@ Beam size (1 for greedy search).
  
  ## Translation - Torch Dtype float16
  - transformers: torch_dtype=torch.float16
- Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models such as Ctranslate2, GPTQ, GGUF)
+ Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
+ 
+ ## Translation - Using Bitsandbytes
+ - transformers: load_in_8bit, load_in_4bit
+ Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
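The VRAM column in the table above can be sanity-checked with simple arithmetic over the parameter count (weights only, ignoring activations and runtime overhead):

def weight_footprint_gb(num_params: float, bytes_per_param: float) -> float:
    # Approximate size of the weights alone, in GiB.
    return num_params * bytes_per_param / 1024**3

print(weight_footprint_gb(12e9, 4))  # float32: ~44.7 GiB, close to the 47.2 GB checkpoint size
print(weight_footprint_gb(12e9, 2))  # float16: ~22.4 GiB, close to the observed 22.1 GB
print(weight_footprint_gb(12e9, 1))  # int8:    ~11.2 GiB with bitsandbytes 8-bit quantization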
requirements-fasterWhisper.txt CHANGED
@@ -22,4 +22,7 @@ accelerate
  auto-gptq
  optimum
  # Needed by ALMA-GGUL
- ctransformers[cuda]
+ ctransformers[cuda]
+ # Needed by load_in_4bit parameters in transformers
+ bitsandbytes==0.41.2; platform_system != "Windows"
+ https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
requirements-whisper.txt CHANGED
@@ -21,4 +21,7 @@ accelerate
  auto-gptq
  optimum
  # Needed by ALMA-GGUL
- ctransformers[cuda]
+ ctransformers[cuda]
+ # Needed by load_in_4bit parameters in transformers
+ bitsandbytes==0.41.2; platform_system != "Windows"
+ https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
requirements.txt CHANGED
@@ -22,4 +22,7 @@ accelerate
  auto-gptq
  optimum
  # Needed by ALMA-GGUL
- ctransformers[cuda]
+ ctransformers[cuda]
+ # Needed by load_in_4bit parameters in transformers
+ bitsandbytes==0.41.2; platform_system != "Windows"
+ https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
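A quick post-install check (not part of the commit) that the platform-specific bitsandbytes wheel resolved correctly and that a GPU is visible:

import torch
import bitsandbytes as bnb

print(bnb.__version__)            # expected 0.41.2 (0.41.2.post2 for the Windows wheel)
print(torch.cuda.is_available())  # bitsandbytes quantization in this app is GPU-only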
src/config.py CHANGED
@@ -83,6 +83,7 @@ class ApplicationConfig:
                   translation_no_repeat_ngram_size: int = 3,
                   translation_num_beams: int = 2,
                   translation_torch_dtype_float16: bool = True,
+                  translation_using_bitsandbytes: str = None,
                   # Whisper Segments Filter
                   whisper_segments_filter: bool = False,
                   whisper_segments_filters: List[str] = [],
@@ -152,6 +153,7 @@ class ApplicationConfig:
          self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
          self.translation_num_beams = translation_num_beams
          self.translation_torch_dtype_float16 = translation_torch_dtype_float16
+         self.translation_using_bitsandbytes = translation_using_bitsandbytes
          # Whisper Segments Filter
          self.whisper_segments_filter = whisper_segments_filter
          self.whisper_segments_filters = whisper_segments_filters
src/translation/translationModel.py CHANGED
@@ -22,6 +22,7 @@ class TranslationModel:
                   noRepeatNgramSize: int = 3,
                   numBeams: int = 2,
                   torchDtypeFloat16: bool = True,
+                  usingBitsandbytes: str = None,
                   downloadRoot: Optional[str] = None,
                   localFilesOnly: bool = False,
                   loadModel: bool = False,
@@ -73,7 +74,14 @@
          )
  
          if device is None:
+             self.totalVram = 0
              if torch.cuda.is_available():
+                 try:
+                     deviceId = torch.cuda.current_device()
+                     self.totalVram = torch.cuda.get_device_properties(deviceId).total_memory/(1024*1024*1024)
+                 except Exception as e:
+                     print(traceback.format_exc())
+                     print("Error detect vram: " + str(e))
                  device = "cuda" if "ct2" in self.modelPath else "cuda:0"
              else:
                  device = "cpu"
@@ -81,12 +89,30 @@
  
          self.device = device
          self.torchDtypeFloat16 = torchDtypeFloat16
+         self.usingBitsandbytes = usingBitsandbytes
  
          if loadModel:
              self.load_model()
  
      def load_model(self):
          """
+         [transformers.BitsAndBytesConfig]
+         load_in_8bit (bool, optional, defaults to False)
+             This flag is used to enable 8-bit quantization with LLM.int8().
+         load_in_4bit (bool, optional, defaults to False)
+             This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from bitsandbytes.
+         llm_int8_enable_fp32_cpu_offload (bool, optional, defaults to False)
+             This flag is used for advanced use cases and users that are aware of this feature.
+             If you want to split your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use this flag.
+             This is useful for offloading large models such as google/flan-t5-xxl. Note that the int8 operations will not be run on CPU.
+         bnb_4bit_compute_dtype (torch.dtype or str, optional, defaults to torch.float32)
+             This sets the computational type which might be different than the input time.
+             For example, inputs might be fp32, but computation can be set to bf16 for speedups.
+         bnb_4bit_quant_type (str, optional, defaults to "fp4")
+             This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types which are specified by fp4 or nf4.
+         bnb_4bit_use_double_quant (bool, optional, defaults to False)
+             This flag is used for nested quantization where the quantization constants from the first quantization are quantized again.
+ 
          [from_pretrained]
          low_cpu_mem_usage(bool, optional):
              Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.
@@ -172,64 +198,96 @@
          """
          try:
              print('\n\nLoading model: %s\n\n' % self.modelPath)
+             kwargsTokenizer = {}
+             kwargsModel = {}
+             kwargsPipeline = {}
+ 
+             if not any(name in self.modelPath for name in ["ct2", "GGUF", "GPTQ"]):
+                 kwargsModel["torch_dtype"] = torch.float16 if self.torchDtypeFloat16 else "auto"
+ 
+             if "GPTQ" in self.modelPath:
+                 kwargsModel.update({"device_map": "auto"})
+             elif "ct2" in self.modelPath:
+                 kwargsModel.update({"device": self.device})
+             elif "GGUF" in self.modelPath:
+                 pass
+             elif self.usingBitsandbytes == None:
+                 kwargsPipeline.update({"device": self.device})
+             elif self.usingBitsandbytes == "int8":
+                 kwargsModel.update({"load_in_8bit": True, "llm_int8_enable_fp32_cpu_offload": True})
+             elif self.usingBitsandbytes == "int4":
+                 kwargsModel.update({"load_in_4bit": True, "llm_int8_enable_fp32_cpu_offload": True,
+                                     "bnb_4bit_use_double_quant": True,
+                                     "bnb_4bit_quant_type": "nf4",
+                                     "bnb_4bit_compute_dtype": torch.bfloat16})
+ 
+             if not any(name in self.modelPath for name in ["ct2", "GGUF"]):
+                 kwargsModel.update({"pretrained_model_name_or_path": self.modelPath, "low_cpu_mem_usage": True})
+ 
              if "ct2" in self.modelPath:
-                 if any(name in self.modelPath for name in ["nllb", "m2m100"]):
+                 kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath})
+                 kwargsModel.update({"model_path": self.modelPath, "compute_type": "auto"})
+                 if "ALMA" in self.modelPath:
+                     self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
+                     self.transModel = ctranslate2.Generator(**kwargsModel)
+                 else:
                      if "nllb" in self.modelPath:
-                         self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.nllb.code)
+                         kwargsTokenizer.update({"src_lang": self.whisperLang.nllb.code})
                          self.targetPrefix = [self.translationLang.nllb.code]
                      elif "m2m100" in self.modelPath:
-                         self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.m2m100.code)
-                         self.targetPrefix = [self.transTokenizer.lang_code_to_token[self.translationLang.m2m100.code]]
-                     self.transModel = ctranslate2.Translator(self.modelPath, compute_type="auto", device=self.device)
-                 elif "ALMA" in self.modelPath:
-                     self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath)
-                     self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-                     self.transModel = ctranslate2.Generator(self.modelPath, compute_type="auto", device=self.device)
-                 elif "madlad400" in self.modelPath:
-                     self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
-                     self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.m2m100.code)
-                     self.transModel = ctranslate2.Translator(self.modelPath, compute_type="auto", device=self.device)
+                         kwargsTokenizer.update({"src_lang": self.whisperLang.m2m100.code})
+                     elif "madlad400" in self.modelPath:
+                         kwargsTokenizer.update({"src_lang": self.whisperLang.m2m100.code})
+                         self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
+                     self.transModel = ctranslate2.Translator(**kwargsModel)
+                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                 if "m2m100" in self.modelPath:
+                     self.targetPrefix = [self.transTokenizer.lang_code_to_token[self.translationLang.m2m100.code]]
              elif "mt5" in self.modelPath:
                  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
-                 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
-                 self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
-                 self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
+                 kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "legacy": False})
+                 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(**kwargsTokenizer)
+                 self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(**kwargsModel)
+                 kwargsPipeline.update({"task": "text2text-generation", "model": self.transModel, "tokenizer": self.transTokenizer})
              elif "ALMA" in self.modelPath:
                  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-                 if "GPTQ" in self.modelPath:
-                     self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
-                     if self.device == "cpu":
-                         # Due to the poor support of GPTQ for CPUs, Therefore, it is strongly discouraged to operate it on CPU.
-                         # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
-                         transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
-                         transModelConfig.quantization_config["use_exllama"] = False
-                         self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
-                     else:
-                         # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
-                         self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
-                 elif "GGUF" in self.modelPath:
+                 if "GGUF" in self.modelPath:
+                     kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelConfig.tokenizer_url})
+                     kwargsModel.update({"model_path_or_repo_id": self.modelPath, "hf": True, "model_file": self.modelConfig.model_file, "model_type": "llama"})
+                     if self.totalVram > 2:
+                         kwargsModel.update({"gpu_layers": int(self.totalVram*7)})
                      import ctransformers
-                     self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
-                     if self.device == "cpu":
-                         self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, low_cpu_mem_usage=True)
-                     else:
-                         self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50, low_cpu_mem_usage=True)
+                     self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(**kwargsModel)
                  else:
-                     self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
-                     self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
-                 self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, device=self.device if "GPTQ" not in self.modelPath and "GGUF" not in self.modelPath else None, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
+                     kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "use_fast": True})
+                     if "GPTQ" in self.modelPath:
+                         kwargsModel.update({"trust_remote_code": False, "revision": self.modelConfig.revision})
+                         if self.device == "cpu":
+                             # Due to the poor support of GPTQ for CPUs, Therefore, it is strongly discouraged to operate it on CPU.
+                             # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
+                             transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
+                             transModelConfig.quantization_config["use_exllama"] = False
+                             kwargsModel.update({"config": transModelConfig})
+                     self.transModel = transformers.AutoModelForCausalLM.from_pretrained(**kwargsModel)
+                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                 kwargsPipeline.update({"task": "text-generation", "model": self.transModel, "tokenizer": self.transTokenizer, "do_sample": True, "temperature": 0.7, "top_k": 40, "top_p": 0.95, "repetition_penalty": 1.1})
              elif "madlad400" in self.modelPath:
                  self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
-                 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False)
-                 self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
-                 self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
+                 kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "legacy": False})
+                 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(**kwargsTokenizer)
+                 self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(**kwargsModel)
+                 kwargsPipeline.update({"task": "text2text-generation", "model": self.transModel, "tokenizer": self.transTokenizer})
              else:
-                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
-                 self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
+                 kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath})
+                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                 self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(**kwargsModel)
+                 kwargsPipeline.update({"task": "translation", "model": self.transModel, "tokenizer": self.transTokenizer})
                  if "m2m100" in self.modelPath:
-                     self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
+                     kwargsPipeline.update({"src_lang": self.whisperLang.m2m100.code, "tgt_lang": self.translationLang.m2m100.code})
                  else: #NLLB
-                     self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
+                     kwargsPipeline.update({"src_lang": self.whisperLang.nllb.code, "tgt_lang": self.translationLang.nllb.code})
+             if "ct2" not in self.modelPath:
+                 self.transTranslator = transformers.pipeline(**kwargsPipeline)
          except Exception as e:
              self.release_vram()
              raise e
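The new VRAM probe above also feeds the GGUF path's gpu_layers heuristic; a stand-alone sketch of that logic (roughly 7 layers offloaded per GiB of detected VRAM, applied only when more than 2 GiB is available):

import torch

totalVram = 0
if torch.cuda.is_available():
    deviceId = torch.cuda.current_device()
    totalVram = torch.cuda.get_device_properties(deviceId).total_memory / (1024 * 1024 * 1024)

# Mirrors the commit: gpu_layers is only passed to ctransformers when totalVram > 2 GiB.
gpu_layers = int(totalVram * 7) if totalVram > 2 else 0
print(f"detected {totalVram:.1f} GiB VRAM -> gpu_layers={gpu_layers}")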