kaleinaNyan
committed on
Commit
•
f0d4174
1
Parent(s):
74f06ca
feat: add ranking board
Browse files
README.md
CHANGED
@@ -98,4 +98,58 @@ judgement_map = {
|
|
98 |
}
|
99 |
|
100 |
print(judgement_map[judgement])
|
101 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}

print(judgement_map[judgement])
```

---

### **Generated ranking**

The ranking was obtained using a modified [Russian LLM Arena code](https://github.com/VikhrModels/ru_llm_arena).
All judgements were regenerated using the jina-judge model. It takes about 16 minutes to regenerate the whole board (or 23 seconds per model) on an RTX3090.

| Model                                            | Score | 95% CI               | Average #Tokens |
|--------------------------------------------------|-------|----------------------|-----------------|
| gpt-4-1106-preview                               | 82.8  | (-2.2, 2.3)          | 541             |
| gpt-4o-mini                                      | 75.3  | (-2.5, 2.9)          | 448             |
| qwen-2.5-72b-it                                  | 73.1  | (-3.4, 3.1)          | 557             |
| gemma-2-9b-it-sppo-iter3                         | 70.6  | (-3.9, 2.8)          | 509             |
| gemma-2-27b-it                                   | 68.7  | (-2.8, 3.8)          | 472             |
| t-lite-instruct-0.1                              | 67.5  | (-3.8, 3.8)          | 810             |
| gemma-2-9b-it                                    | 67.0  | (-3.7, 3.3)          | 459             |
| suzume-llama-3-8B-multilingual-orpo-borda-half   | 62.4  | (-3.5, 3.7)          | 682             |
| glm-4-9b-chat                                    | 61.5  | (-3.7, 3.0)          | 568             |
| phi-3-medium-4k-instruct                         | 60.4  | (-3.5, 3.7)          | 566             |
| sfr-iterative-dpo-llama-3-8b-r                   | 57.2  | (-3.9, 2.2)          | 516             |
| c4ai-command-r-v01                               | 55.0  | (-3.9, 3.1)          | 529             |
| suzume-llama-3-8b-multilingual                   | 51.9  | (-2.8, 3.7)          | 641             |
| mistral-nemo-instruct-2407                       | 51.9  | (-3.8, 3.7)          | 403             |
| yandex_gpt_pro                                   | 50.3  | (-3.4, 3.1)          | 345             |
| gpt-3.5-turbo-0125                               | 50.0  | (0.0, 0.0)           | 220             |
| hermes-2-theta-llama-3-8b                        | 49.3  | (-3.4, 3.9)          | 485             |
| starling-lm-7b-beta                              | 48.3  | (-3.8, 4.0)          | 629             |
| llama-3-8b-saiga-suzume-ties                     | 47.9  | (-3.9, 5.0)          | 763             |
| llama-3-smaug-8b                                 | 47.6  | (-3.6, 3.1)          | 524             |
| vikhr-it-5.4-fp16-orpo-v2                        | 46.8  | (-2.5, 2.7)          | 379             |
| aya-23-8b                                        | 46.1  | (-3.9, 3.9)          | 554             |
| saiga_llama3_8b_v6                               | 44.8  | (-3.4, 3.3)          | 471             |
| qwen2-7b-instruct                                | 43.6  | (-3.0, 2.7)          | 340             |
| vikhr-it-5.2-fp16-cp                             | 43.6  | (-4.1, 3.3)          | 543             |
| openchat-3.5-0106                                | 42.8  | (-3.9, 3.3)          | 492             |
| kolibri-mistral-0427-upd                         | 42.3  | (-4.2, 3.2)          | 551             |
| paralex-llama-3-8b-sft                           | 41.8  | (-3.2, 3.7)          | 688             |
| llama-3-instruct-8b-sppo-iter3                   | 41.7  | (-3.4, 3.3)          | 502             |
| gpt-3.5-turbo-1106                               | 41.5  | (-2.9, 2.1)          | 191             |
| mistral-7b-instruct-v0.3                         | 41.1  | (-4.3, 3.5)          | 469             |
| gigachat_pro                                     | 40.9  | (-3.4, 3.6)          | 294             |
| openchat-3.6-8b-20240522                         | 39.1  | (-3.2, 4.1)          | 428             |
| vikhr-it-5.3-fp16-32k                            | 38.8  | (-3.5, 3.3)          | 519             |
| hermes-2-pro-llama-3-8b                          | 38.4  | (-3.2, 3.1)          | 463             |
| kolibri-vikhr-mistral-0427                       | 34.5  | (-2.9, 3.5)          | 489             |
| vikhr-it-5.3-fp16                                | 33.5  | (-3.5, 3.8)          | 523             |
| llama-3-instruct-8b-simpo                        | 32.7  | (-3.9, 3.6)          | 417             |
| meta-llama-3-8b-instruct                         | 32.1  | (-3.4, 3.3)          | 450             |
| neural-chat-7b-v3-3                              | 25.9  | (-2.7, 3.6)          | 927             |
| gigachat_lite                                    | 25.4  | (-2.8, 2.5)          | 276             |
| snorkel-mistral-pairrm-dpo                       | 10.3  | (-2.0, 2.3)          | 773             |
| storm-7b                                         | 3.7   | (-1.3, 1.6)          | 419             |