datatab committed on
Commit ae39f40
1 Parent(s): 12a7a9b

Update README.md

Files changed (1)
  1. README.md +31 -0
README.md CHANGED
@@ -15,3 +15,34 @@ base_model: gordicaleksa/YugoGPT
  - **Developed by:** datatab
  - **License:** apache-2.0
  - **Finetuned from model:** gordicaleksa/YugoGPT
+
+
+ # Quantization preference
+
+ ```python
+ # GGUF quantization methods and the trade-offs of each:
+ quantization_methods = {
+     "not_quantized"  : "Recommended. Fast conversion. Slow inference, big files.",
+     "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
+     "quantized"      : "Recommended. Slow conversion. Fast inference, small files.",
+     "f32"            : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
+     "f16"            : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+     "q8_0"           : "Fast conversion. High resource use, but generally acceptable.",
+     "q4_k_m"         : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.",
+     "q5_k_m"         : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.",
+     "q2_k"           : "Uses Q4_K for the attention.wv and feed_forward.w2 tensors, Q2_K for the other tensors.",
+     "q3_k_l"         : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K.",
+     "q3_k_m"         : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K.",
+     "q3_k_s"         : "Uses Q3_K for all tensors.",
+     "q4_0"           : "Original quant method, 4-bit.",
+     "q4_1"           : "Higher accuracy than q4_0 but not as high as q5_0; quicker inference than the q5 models.",
+     "q4_k_s"         : "Uses Q4_K for all tensors.",
+     "q4_k"           : "Alias for q4_k_m.",
+     "q5_k"           : "Alias for q5_k_m.",
+     "q5_0"           : "Higher accuracy, higher resource usage, and slower inference.",
+     "q5_1"           : "Even higher accuracy and resource usage, and slower inference.",
+     "q5_k_s"         : "Uses Q5_K for all tensors.",
+     "q6_k"           : "Uses Q8_K for all tensors.",
+     "iq2_xxs"        : "2.06 bpw quantization.",
+     "iq2_xs"         : "2.31 bpw quantization.",
+     "iq3_xxs"        : "3.06 bpw quantization.",
+     "q3_k_xs"        : "3-bit extra small quantization.",
+ }
+ ```
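+
+ As a minimal sketch of how one of the recommended methods might be applied: the following assumes the fine-tune was exported with Unsloth, whose `save_pretrained_gguf` helper accepts the `quantization_method` names listed above. The model name and output directory here are purely illustrative, not actual repo paths.
+
+ ```python
+ from unsloth import FastLanguageModel
+
+ # Load the fine-tuned model (illustrative name, not an actual repo).
+ model, tokenizer = FastLanguageModel.from_pretrained("datatab/YugoGPT-finetuned")
+
+ # Export to GGUF with the recommended q4_k_m method from the list above.
+ model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")
+ ```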