Translation model parameter "Using Bitsandbytes" has been added.
Using the Bitsandbytes package, float32 models can be dynamically converted into mixed 8-bit or 4-bit precision quantized models, which substantially reduces VRAM usage. The quantization parameter applies only to models that have not yet been quantized; it is not suitable for already-quantized formats such as CTranslate2, GPTQ, and GGUF.
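For context, the new option's "int8" and "int4" choices correspond to bitsandbytes quantization in transformers. The sketch below is illustrative only and is not code from this commit: the model id is a placeholder, and a CUDA GPU with the bitsandbytes package installed is assumed.

```python
# Illustrative sketch only (not code from this commit): how the "int8" / "int4"
# choices map onto bitsandbytes quantization in transformers. The model id is a
# placeholder; any not-yet-quantized float32 checkpoint works the same way.
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig

def load_quantized(model_id: str, using_bitsandbytes: str):
    if using_bitsandbytes == "int8":
        # Mixed 8-bit precision (LLM.int8()).
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
    elif using_bitsandbytes == "int4":
        # 4-bit NF4 quantization with nested quantization, computing in bfloat16.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    else:
        quant_config = None  # load in full precision
    return AutoModelForSeq2SeqLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        low_cpu_mem_usage=True,
    )

model = load_quantized("facebook/nllb-200-distilled-600M", "int8")
```

Because the weights are quantized on the fly while a float32 checkpoint is loaded, the option has no effect on checkpoints that already ship quantized (Ctranslate2, GPTQ, GGUF).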
- app.py +8 -4
- config.json5 +71 -68
- docs/options.md +5 -1
- docs/translateModel.md +15 -11
- requirements-fasterWhisper.txt +4 -1
- requirements-whisper.txt +4 -1
- requirements.txt +4 -1
- src/config.py +2 -0
- src/translation/translationModel.py +101 -43
app.py
CHANGED
@@ -240,7 +240,8 @@ class WhisperTranscriber:
         translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
         translationNumBeams: int = decodeOptions.pop("translationNumBeams")
         translationTorchDtypeFloat16: bool = decodeOptions.pop("translationTorchDtypeFloat16")
-
+        translationUsingBitsandbytes: str = decodeOptions.pop("translationUsingBitsandbytes")
+
         sourceInput: str = decodeOptions.pop("sourceInput")
         urlData: str = decodeOptions.pop("urlData")
         multipleFiles: List = decodeOptions.pop("multipleFiles")

@@ -377,7 +378,7 @@ class WhisperTranscriber:
                 translationLang = get_lang_from_m2m100_name(madlad400LangName)

             if translationLang is not None:
-                translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16)
+                translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16, usingBitsandbytes=translationUsingBitsandbytes)

             progress(0, desc="init transcribe")
             # Result

@@ -937,7 +938,9 @@ def create_ui(app_config: ApplicationConfig):
     mt5_models = app_config.get_model_names("mt5")
     ALMA_models = app_config.get_model_names("ALMA")
     madlad400_models = app_config.get_model_names("madlad400")
-    if not torch.cuda.is_available(): #
+    if not torch.cuda.is_available(): # Loading only quantized or models with medium-low parameters in an environment without GPU support.
+        nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
+        m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
         ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
         madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))

@@ -970,7 +973,8 @@ def create_ui(app_config: ApplicationConfig):
             gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
             gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
             gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams"),
-            gr.Checkbox(label="Translation - Torch Dtype float16", info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized
+            gr.Checkbox(label="Translation - Torch Dtype float16", visible=torch.cuda.is_available(), value=app_config.translation_torch_dtype_float16, info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)", elem_id="translationTorchDtypeFloat16"),
+            gr.Radio(label="Translation - Using Bitsandbytes", visible=torch.cuda.is_available(), choices=[None, "int8", "int4"], value=app_config.translation_using_bitsandbytes, info="Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)", elem_id="translationUsingBitsandbytes"),
         }

         common_vad_inputs = lambda : {
config.json5
CHANGED
@@ -59,30 +59,41 @@
       "type": "huggingface",
       "tokenizer_url": "facebook/m2m100_1.2B"
     },
+    {
+      "name": "m2m100_1.2B/facebook",
+      "url": "facebook/m2m100_1.2B",
+      "type": "huggingface"
+    },
     {
       "name": "m2m100_418M-ct2fast/michaelfeil",
       "url": "michaelfeil/ct2fast-m2m100_418M",
       "type": "huggingface",
       "tokenizer_url": "facebook/m2m100_418M"
     },
-    //{
-    //  "name": "m2m100-12B-ct2fast/michaelfeil",
-    //  "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
-    //},
     {
-      "name": "
-      "url": "facebook/
+      "name": "m2m100_418M/facebook",
+      "url": "facebook/m2m100_418M",
       "type": "huggingface"
     },
     {
-      "name": "
-      "url": "facebook/
+      "name": "m2m100-12B-last-ckpt/facebook",
+      "url": "facebook/m2m100-12B-last-ckpt",
       "type": "huggingface"
+    },
+    {
+      "name": "m2m100-12B-ct2fast/michaelfeil",
+      "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
     }
   ],
   "nllb": [
+    {
+      "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+    },
     {
       "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
       "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",

@@ -90,10 +101,22 @@
       "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
     },
     {
-      "name": "nllb-200-
-      "url": "
+      "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+    },
+    {
+      "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
+    },
+    {
+      "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
+      "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
+      "type": "huggingface",
+      "tokenizer_url": "facebook/nllb-200-1.3B"
     },
     {
       "name": "nllb-200-1.3B-ct2:float16/JustFrederik",

@@ -102,22 +125,37 @@
       "tokenizer_url": "facebook/nllb-200-1.3B"
     },
     {
-      "name": "nllb-200-
-      "url": "JustFrederik/nllb-200-
+      "name": "nllb-200-1.3B-ct2/JustFrederik",
+      "url": "JustFrederik/nllb-200-1.3B-ct2",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-1.3B"
     },
     {
-      "name": "nllb-200-1.3B
-      "url": "
+      "name": "nllb-200-distilled-1.3B/facebook",
+      "url": "facebook/nllb-200-distilled-1.3B",
+      "type": "huggingface"
+    },
+    {
+      "name": "nllb-200-1.3B/facebook",
+      "url": "facebook/nllb-200-1.3B",
+      "type": "huggingface"
+    },
+    {
+      "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
+      "url": "michaelfeil/ct2fast-nllb-200-3.3B",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-3.3B"
     },
     {
-      "name": "nllb-200-
-      "url": "JustFrederik/nllb-200-
+      "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
+      "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
       "type": "huggingface",
-      "tokenizer_url": "facebook/nllb-200-
+      "tokenizer_url": "facebook/nllb-200-3.3B"
+    },
+    {
+      "name": "nllb-200-3.3B/facebook",
+      "url": "facebook/nllb-200-3.3B",
+      "type": "huggingface"
     },
     {
       "name": "nllb-200-distilled-600M/facebook",

@@ -125,8 +163,8 @@
       "type": "huggingface"
     },
     {
-      "name": "nllb-200-distilled-600M-ct2/JustFrederik",
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2",
+      "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
       "type": "huggingface",
       "tokenizer_url": "facebook/nllb-200-distilled-600M"
     },

@@ -137,48 +175,11 @@
       "tokenizer_url": "facebook/nllb-200-distilled-600M"
     },
     {
-      "name": "nllb-200-distilled-600M-ct2
-      "url": "JustFrederik/nllb-200-distilled-600M-ct2
+      "name": "nllb-200-distilled-600M-ct2/JustFrederik",
+      "url": "JustFrederik/nllb-200-distilled-600M-ct2",
       "type": "huggingface",
       "tokenizer_url": "facebook/nllb-200-distilled-600M"
     }
-    // Uncomment to add official Facebook 1.3B and 3.3B model
-    // The official Facebook 1.3B and 3.3B model files are too large,
-    // and to avoid occupying too much disk space on Hugging Face's free spaces,
-    // these models are not included in the config.
-    //{
-    //  "name": "nllb-200-distilled-1.3B/facebook",
-    //  "url": "facebook/nllb-200-distilled-1.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-1.3B/facebook",
-    //  "url": "facebook/nllb-200-1.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-3.3B/facebook",
-    //  "url": "facebook/nllb-200-3.3B",
-    //  "type": "huggingface"
-    //},
-    //{
-    //  "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
-    //},
-    //{
-    //  "name": "nllb-200-1.3B-ct2/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-1.3B-ct2",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/nllb-200-1.3B"
-    //},
-    //{
-    //  "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
-    //  "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
-    //  "type": "huggingface",
-    //  "tokenizer_url": "facebook/nllb-200-3.3B"
-    //},
   ],
   "mt5": [
     {

@@ -238,7 +239,7 @@
       "name": "ALMA-13B/haoranxu",
       "url": "haoranxu/ALMA-13B",
       "type": "huggingface"
-    }
+    }
   ],
   "madlad400": [
     {

@@ -256,18 +257,18 @@
     {
       "name": "madlad400-3b-mt/jbochi",
       "url": "jbochi/madlad400-3b-mt",
-      "type": "huggingface"
+      "type": "huggingface"
     },
     {
       "name": "madlad400-7b-mt-bt/jbochi",
       "url": "jbochi/madlad400-7b-mt-bt",
-      "type": "huggingface"
+      "type": "huggingface"
     },
     {
       "name": "madlad400-10b-mt/jbochi",
       "url": "jbochi/madlad400-10b-mt",
-      "type": "huggingface"
-    }
+      "type": "huggingface"
+    }
   ]
 },
 // Configuration options that will be used if they are not specified in the command line arguments.

@@ -400,4 +401,6 @@
   "translation_num_beams": 2,
   // Translation - Torch Dtype float16, Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
   "translation_torch_dtype_float16": true,
+  // Translation - Using Bitsandbytes, Load the float32 translation model into mixed-8bit or 4bit precision quantized model(not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
+  "translation_using_bitsandbytes": null
 }
docs/options.md
CHANGED
@@ -204,4 +204,8 @@ Beam size (1 for greedy search).

 ## Translation - Torch Dtype float16
 - transformers: torch_dtype=torch.float16
-Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized
+Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
+
+## Translation - Using Bitsandbytes
+- transformers: load_in_8bit, load_in_4bit
+Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
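The load_in_8bit / load_in_4bit flags named above are the ones this commit forwards to from_pretrained in src/translation/translationModel.py. A condensed sketch of that pattern follows; the model id is only a placeholder, and a CUDA GPU with bitsandbytes installed is assumed.

```python
# Condensed illustration of the pattern used by this commit: build a kwargs dict
# and forward the bitsandbytes flags directly to from_pretrained.
# The model id below is a placeholder, not part of the commit.
import torch
import transformers

kwargsModel = {"pretrained_model_name_or_path": "facebook/nllb-200-distilled-600M",
               "low_cpu_mem_usage": True}

usingBitsandbytes = "int4"  # None, "int8" or "int4", as exposed by the new UI option
if usingBitsandbytes == "int8":
    kwargsModel.update({"load_in_8bit": True, "llm_int8_enable_fp32_cpu_offload": True})
elif usingBitsandbytes == "int4":
    kwargsModel.update({"load_in_4bit": True, "llm_int8_enable_fp32_cpu_offload": True,
                        "bnb_4bit_use_double_quant": True,
                        "bnb_4bit_quant_type": "nf4",
                        "bnb_4bit_compute_dtype": torch.bfloat16})

transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(**kwargsModel)
```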
docs/translateModel.md
CHANGED
@@ -22,7 +22,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October
 |------|------------|------|---------------|---------------|
 | [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 418M | 1.94 GB | float32 | ≈2 GB |
 | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
-| [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 |
+| [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | 22.1 GB (torch dtype in float16) |

 ## M2M100-CTranslate2

@@ -133,18 +133,18 @@ madlad400 is a multilingual machine translation model based on the T5 architecture

 ## SeamlessM4T

-SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.

-It enables multiple tasks without relying on separate models:
+It enables multiple tasks without relying on separate models:

-Speech-to-speech translation (S2ST)
-Speech-to-text translation (S2TT)
-Text-to-speech translation (T2ST)
-Text-to-text translation (T2TT)
-Automatic speech recognition (ASR)
+Speech-to-speech translation (S2ST)
+Speech-to-text translation (S2TT)
+Text-to-speech translation (T2ST)
+Text-to-text translation (T2TT)
+Automatic speech recognition (ASR)

-SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`"([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
-SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`"([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))
+SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`"([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
+SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`"([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|

@@ -175,4 +175,8 @@ Beam size (1 for greedy search).

 ## Translation - Torch Dtype float16
 - transformers: torch_dtype=torch.float16
-Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized
+Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
+
+## Translation - Using Bitsandbytes
+- transformers: load_in_8bit, load_in_4bit
+Load the float32 translation model into mixed-8bit or 4bit precision quantized model when the system supports GPU (reducing VRAM usage, not applicable to models that have already been quantized, such as Ctranslate2, GPTQ, GGUF)
requirements-fasterWhisper.txt
CHANGED
@@ -22,4 +22,7 @@ accelerate
 auto-gptq
 optimum
 # Needed by ALMA-GGUL
-ctransformers[cuda]
+ctransformers[cuda]
+# Needed by load_in_4bit parameters in transformers
+bitsandbytes==0.41.2; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
requirements-whisper.txt
CHANGED
@@ -21,4 +21,7 @@ accelerate
 auto-gptq
 optimum
 # Needed by ALMA-GGUL
-ctransformers[cuda]
+ctransformers[cuda]
+# Needed by load_in_4bit parameters in transformers
+bitsandbytes==0.41.2; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
requirements.txt
CHANGED
@@ -22,4 +22,7 @@ accelerate
 auto-gptq
 optimum
 # Needed by ALMA-GGUL
-ctransformers[cuda]
+ctransformers[cuda]
+# Needed by load_in_4bit parameters in transformers
+bitsandbytes==0.41.2; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl; platform_system == "Windows"
src/config.py
CHANGED
@@ -83,6 +83,7 @@ class ApplicationConfig:
                  translation_no_repeat_ngram_size: int = 3,
                  translation_num_beams: int = 2,
                  translation_torch_dtype_float16: bool = True,
+                 translation_using_bitsandbytes: str = None,
                  # Whisper Segments Filter
                  whisper_segments_filter: bool = False,
                  whisper_segments_filters: List[str] = [],

@@ -152,6 +153,7 @@ class ApplicationConfig:
         self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
         self.translation_num_beams = translation_num_beams
         self.translation_torch_dtype_float16 = translation_torch_dtype_float16
+        self.translation_using_bitsandbytes = translation_using_bitsandbytes
         # Whisper Segments Filter
         self.whisper_segments_filter = whisper_segments_filter
         self.whisper_segments_filters = whisper_segments_filters
src/translation/translationModel.py
CHANGED
@@ -22,6 +22,7 @@ class TranslationModel:
                  noRepeatNgramSize: int = 3,
                  numBeams: int = 2,
                  torchDtypeFloat16: bool = True,
+                 usingBitsandbytes: str = None,
                  downloadRoot: Optional[str] = None,
                  localFilesOnly: bool = False,
                  loadModel: bool = False,

@@ -73,7 +74,14 @@ class TranslationModel:
         )

         if device is None:
+            self.totalVram = 0
             if torch.cuda.is_available():
+                try:
+                    deviceId = torch.cuda.current_device()
+                    self.totalVram = torch.cuda.get_device_properties(deviceId).total_memory/(1024*1024*1024)
+                except Exception as e:
+                    print(traceback.format_exc())
+                    print("Error detect vram: " + str(e))
                 device = "cuda" if "ct2" in self.modelPath else "cuda:0"
             else:
                 device = "cpu"

@@ -81,12 +89,30 @@ class TranslationModel:

         self.device = device
         self.torchDtypeFloat16 = torchDtypeFloat16
+        self.usingBitsandbytes = usingBitsandbytes

         if loadModel:
             self.load_model()

     def load_model(self):
         """
+        [transformers.BitsAndBytesConfig]
+        load_in_8bit (bool, optional, defaults to False)
+            This flag is used to enable 8-bit quantization with LLM.int8().
+        load_in_4bit (bool, optional, defaults to False)
+            This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from bitsandbytes.
+        llm_int8_enable_fp32_cpu_offload (bool, optional, defaults to False)
+            This flag is used for advanced use cases and users that are aware of this feature.
+            If you want to split your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use this flag.
+            This is useful for offloading large models such as google/flan-t5-xxl. Note that the int8 operations will not be run on CPU.
+        bnb_4bit_compute_dtype (torch.dtype or str, optional, defaults to torch.float32)
+            This sets the computational type which might be different than the input time.
+            For example, inputs might be fp32, but computation can be set to bf16 for speedups.
+        bnb_4bit_quant_type (str, optional, defaults to "fp4")
+            This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types which are specified by fp4 or nf4.
+        bnb_4bit_use_double_quant (bool, optional, defaults to False)
+            This flag is used for nested quantization where the quantization constants from the first quantization are quantized again.
+
         [from_pretrained]
         low_cpu_mem_usage(bool, optional):
             Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.

@@ -172,64 +198,96 @@ class TranslationModel:
         """
         try:
             print('\n\nLoading model: %s\n\n' % self.modelPath)
+            kwargsTokenizer = {}
+            kwargsModel = {}
+            kwargsPipeline = {}
+
+            if not any(name in self.modelPath for name in ["ct2", "GGUF", "GPTQ"]):
+                kwargsModel["torch_dtype"] = torch.float16 if self.torchDtypeFloat16 else "auto"
+
+            if "GPTQ" in self.modelPath:
+                kwargsModel.update({"device_map": "auto"})
+            elif "ct2" in self.modelPath:
+                kwargsModel.update({"device": self.device})
+            elif "GGUF" in self.modelPath:
+                pass
+            elif self.usingBitsandbytes == None:
+                kwargsPipeline.update({"device": self.device})
+            elif self.usingBitsandbytes == "int8":
+                kwargsModel.update({"load_in_8bit": True, "llm_int8_enable_fp32_cpu_offload": True})
+            elif self.usingBitsandbytes == "int4":
+                kwargsModel.update({"load_in_4bit": True, "llm_int8_enable_fp32_cpu_offload": True,
+                                    "bnb_4bit_use_double_quant": True,
+                                    "bnb_4bit_quant_type": "nf4",
+                                    "bnb_4bit_compute_dtype": torch.bfloat16})
+
+            if not any(name in self.modelPath for name in ["ct2", "GGUF"]):
+                kwargsModel.update({"pretrained_model_name_or_path": self.modelPath, "low_cpu_mem_usage": True})
+
             if "ct2" in self.modelPath:
-
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath})
+                kwargsModel.update({"model_path": self.modelPath, "compute_type": "auto"})
+                if "ALMA" in self.modelPath:
+                    self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
+                    self.transModel = ctranslate2.Generator(**kwargsModel)
+                else:
                 if "nllb" in self.modelPath:
-
+                        kwargsTokenizer.update({"src_lang": self.whisperLang.nllb.code})
                     self.targetPrefix = [self.translationLang.nllb.code]
                 elif "m2m100" in self.modelPath:
-
-
-
-
-                    self.
-
-
-
-                    self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
-                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.m2m100.code)
-                self.transModel = ctranslate2.Translator(self.modelPath, compute_type="auto", device=self.device)
+                        kwargsTokenizer.update({"src_lang": self.whisperLang.m2m100.code})
+                    elif "madlad400" in self.modelPath:
+                        kwargsTokenizer.update({"src_lang": self.whisperLang.m2m100.code})
+                        self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
+                    self.transModel = ctranslate2.Translator(**kwargsModel)
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                    if "m2m100" in self.modelPath:
+                        self.targetPrefix = [self.transTokenizer.lang_code_to_token[self.translationLang.m2m100.code]]
             elif "mt5" in self.modelPath:
                 self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
-
-                self.
-                self.
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "legacy": False})
+                self.transTokenizer = transformers.T5Tokenizer.from_pretrained(**kwargsTokenizer)
+                self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(**kwargsModel)
+                kwargsPipeline.update({"task": "text2text-generation", "model": self.transModel, "tokenizer": self.transTokenizer})
             elif "ALMA" in self.modelPath:
                 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-                if "
-
-
-
-
-                        transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
-                        transModelConfig.quantization_config["use_exllama"] = False
-                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
-                    else:
-                        # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
-                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
-                elif "GGUF" in self.modelPath:
+                if "GGUF" in self.modelPath:
+                    kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelConfig.tokenizer_url})
+                    kwargsModel.update({"model_path_or_repo_id": self.modelPath, "hf": True, "model_file": self.modelConfig.model_file, "model_type": "llama"})
+                    if self.totalVram > 2:
+                        kwargsModel.update({"gpu_layers":int(self.totalVram*7)})
                     import ctransformers
-                    self.
-                    if self.device == "cpu":
-                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, low_cpu_mem_usage=True)
-                    else:
-                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50, low_cpu_mem_usage=True)
+                    self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(**kwargsModel)
                 else:
-
-
-
+                    kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "use_fast": True})
+                    if "GPTQ" in self.modelPath:
+                        kwargsModel.update({"trust_remote_code": False, "revision": self.modelConfig.revision})
+                        if self.device == "cpu":
+                            # Due to the poor support of GPTQ for CPUs, Therefore, it is strongly discouraged to operate it on CPU.
+                            # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
+                            transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
+                            transModelConfig.quantization_config["use_exllama"] = False
+                            kwargsModel.update({"config": transModelConfig})
+                    self.transModel = transformers.AutoModelForCausalLM.from_pretrained(**kwargsModel)
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                kwargsPipeline.update({"task": "text-generation", "model": self.transModel, "tokenizer": self.transTokenizer, "do_sample": True, "temperature": 0.7, "top_k": 40, "top_p": 0.95, "repetition_penalty": 1.1})
             elif "madlad400" in self.modelPath:
                 self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
-
-                self.
-                self.
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath, "legacy": False})
+                self.transTokenizer = transformers.T5Tokenizer.from_pretrained(**kwargsTokenizer)
+                self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(**kwargsModel)
+                kwargsPipeline.update({"task": "text2text-generation", "model": self.transModel, "tokenizer": self.transTokenizer})
             else:
-
-                self.
+                kwargsTokenizer.update({"pretrained_model_name_or_path": self.modelPath})
+                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(**kwargsTokenizer)
+                self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(**kwargsModel)
+                kwargsPipeline.update({"task": "translation", "model": self.transModel, "tokenizer": self.transTokenizer})
                 if "m2m100" in self.modelPath:
-
+                    kwargsPipeline.update({"src_lang": self.whisperLang.m2m100.code, "tgt_lang": self.translationLang.m2m100.code})
                 else: #NLLB
-
+                    kwargsPipeline.update({"src_lang": self.whisperLang.nllb.code, "tgt_lang": self.translationLang.nllb.code})
+            if "ct2" not in self.modelPath:
+                self.transTranslator = transformers.pipeline(**kwargsPipeline)
         except Exception as e:
             self.release_vram()
             raise e