Update README.md
Browse files
README.md
CHANGED
@@ -15,3 +15,34 @@ base_model: gordicaleksa/YugoGPT
15   - **Developed by:** datatab
16   - **License:** apache-2.0
17   - **Finetuned from model :** gordicaleksa/YugoGPT
15   - **Developed by:** datatab
16   - **License:** apache-2.0
17   - **Finetuned from model :** gordicaleksa/YugoGPT
18 +
19 +
20 + # Quant. preference
21 +
22 + ```bash
23 + "not_quantized"  : "Recommended. Fast conversion. Slow inference, big files.",
24 + "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
25 + "quantized"      : "Recommended. Slow conversion. Fast inference, small files.",
26 + "f32"     : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
27 + "f16"     : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
28 + "q8_0"    : "Fast conversion. High resource use, but generally acceptable.",
29 + "q4_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
30 + "q5_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
31 + "q2_k"    : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
32 + "q3_k_l"  : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
33 + "q3_k_m"  : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
34 + "q3_k_s"  : "Uses Q3_K for all tensors",
35 + "q4_0"    : "Original quant method, 4-bit.",
36 + "q4_1"    : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
37 + "q4_k_s"  : "Uses Q4_K for all tensors",
38 + "q4_k"    : "alias for q4_k_m",
39 + "q5_k"    : "alias for q5_k_m",
40 + "q5_0"    : "Higher accuracy, higher resource usage and slower inference.",
41 + "q5_1"    : "Even higher accuracy, resource usage and slower inference.",
42 + "q5_k_s"  : "Uses Q5_K for all tensors",
43 + "q6_k"    : "Uses Q8_K for all tensors",
44 + "iq2_xxs" : "2.06 bpw quantization",
45 + "iq2_xs"  : "2.31 bpw quantization",
46 + "iq3_xxs" : "3.06 bpw quantization",
47 + "q3_k_xs" : "3-bit extra small quantization"
48 + ```