weichiang committed on
Commit
2bba63b
1 Parent(s): ed12ee3

Arena-Hard-Auto Leaderboard UI (#51)

Browse files

- add new org info (b07b84586afab546df88aa91706437955dfcb1e4)
- add arena hard leaderboard data (c89d0277b82863ad50b242253baeab8fbc83ad51)
- rename (44194cadcd623b0f93cbd2237d175473b6412d1b)

arena_hard_auto_leaderboard_v0.1.csv ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,score,rating_q025,rating_q975,CI,avg_tokens,date
2
+ gpt-4-turbo-2024-04-09,82.63,80.75,84.6,"(1.9, 2.0)",662.0,2024-07-31
3
+ claude-3-5-sonnet-20240620,79.35,77.25,80.62,"(2.1, 1.3)",567.0,2024-07-31
4
+ gpt-4o-2024-05-13,79.21,77.42,80.71,"(1.8, 1.5)",696.0,2024-07-31
5
+ gpt-4-0125-preview,77.96,75.94,79.9,"(2.0, 1.9)",619.0,2024-07-31
6
+ athene-70b-0725,76.83,74.84,78.74,"(2.0, 1.9)",683.0,2024-07-31
7
+ gpt-4o-mini-2024-07-18,74.94,72.66,77.07,"(2.3, 2.1)",668.0,2024-07-31
8
+ gemini-1.5-pro-api-0514,71.96,69.62,74.62,"(2.3, 2.7)",676.0,2024-07-31
9
+ yi-large-preview,71.48,69.02,73.37,"(2.5, 1.9)",720.0,2024-07-31
10
+ mistral-large-2407,70.42,68.11,72.43,"(2.3, 2.0)",623.0,2024-07-31
11
+ llama-3.1-405b-instruct,64.09,61.43,66.55,"(2.7, 2.5)",633.0,2024-07-31
12
+ glm-4-0520,63.84,61.28,66.19,"(2.6, 2.3)",636.0,2024-07-31
13
+ yi-large,63.7,61.76,65.86,"(1.9, 2.2)",626.0,2024-07-31
14
+ deepseek-coder-v2,62.3,59.82,64.72,"(2.5, 2.4)",578.0,2024-07-31
15
+ claude-3-opus-20240229,60.36,57.56,62.34,"(2.8, 2.0)",541.0,2024-07-31
16
+ gemma-2-27b-it,57.51,55.11,60.12,"(2.4, 2.6)",577.0,2024-07-31
17
+ llama-3.1-70b-instruct,55.73,52.85,58.2,"(2.9, 2.5)",628.0,2024-07-31
18
+ glm-4-0116,55.72,53.83,58.16,"(1.9, 2.4)",622.0,2024-07-31
19
+ gemini-1.5-pro-api-0409-preview,53.37,51.13,56.66,"(2.2, 3.3)",478.0,2024-07-31
20
+ glm-4-air,50.88,48.62,53.21,"(2.3, 2.3)",619.0,2024-07-31
21
+ gpt-4-0314,50.0,50.0,50.0,"(0.0, 0.0)",423.0,2024-07-31
22
+ gemini-1.5-flash-api-0514,49.61,47.46,52.17,"(2.1, 2.6)",642.0,2024-07-31
23
+ qwen2-72b-instruct,46.86,44.57,49.29,"(2.3, 2.4)",515.0,2024-07-31
24
+ claude-3-sonnet-20240229,46.8,44.12,49.04,"(2.7, 2.2)",552.0,2024-07-31
25
+ llama-3-70b-instruct,46.57,43.84,49.18,"(2.7, 2.6)",591.0,2024-07-31
26
+ claude-3-haiku-20240307,41.47,39.57,44.02,"(1.9, 2.6)",505.0,2024-07-31
27
+ gpt-4-0613,37.9,35.6,40.36,"(2.3, 2.5)",354.0,2024-07-31
28
+ mistral-large-2402,37.71,34.81,39.77,"(2.9, 2.1)",400.0,2024-07-31
29
+ mixtral-8x22b-instruct-v0.1,36.36,34.21,38.55,"(2.1, 2.2)",430.0,2024-07-31
30
+ qwen1.5-72b-chat,36.12,33.88,38.15,"(2.2, 2.0)",474.0,2024-07-31
31
+ phi-3-medium-4k-instruct,33.37,31.26,35.14,"(2.1, 1.8)",517.0,2024-07-31
32
+ command-r-plus,33.07,30.85,35.12,"(2.2, 2.0)",541.0,2024-07-31
33
+ mistral-medium,31.9,29.66,34.31,"(2.2, 2.4)",485.0,2024-07-31
34
+ phi-3-small-8k-instruct,29.77,27.94,31.97,"(1.8, 2.2)",568.0,2024-07-31
35
+ mistral-next,27.37,25.4,29.09,"(2.0, 1.7)",297.0,2024-07-31
36
+ gpt-3.5-turbo-0613,24.82,22.54,26.29,"(2.3, 1.5)",401.0,2024-07-31
37
+ dbrx-instruct-preview,24.63,22.33,26.83,"(2.3, 2.2)",415.0,2024-07-31
38
+ claude-2.0,23.99,21.71,25.65,"(2.3, 1.7)",295.0,2024-07-31
39
+ mixtral-8x7b-instruct-v0.1,23.4,21.38,25.41,"(2.0, 2.0)",457.0,2024-07-31
40
+ gpt-3.5-turbo-0125,23.34,21.67,25.27,"(1.7, 1.9)",329.0,2024-07-31
41
+ yi-34b-chat,23.15,20.75,24.7,"(2.4, 1.6)",611.0,2024-07-31
42
+ starling-lm-7b-beta,23.01,20.81,24.66,"(2.2, 1.6)",530.0,2024-07-31
43
+ claude-2.1,22.77,20.65,25.43,"(2.1, 2.7)",290.0,2024-07-31
44
+ llama-3.1-8b-instruct,21.34,19.71,23.09,"(1.6, 1.8)",861.0,2024-07-31
45
+ snorkel-mistral-pairrm-dpo,20.73,19.04,22.05,"(1.7, 1.3)",564.0,2024-07-31
46
+ llama-3-8b-instruct,20.56,18.82,22.61,"(1.7, 2.1)",585.0,2024-07-31
47
+ gpt-3.5-turbo-1106,18.87,17.06,20.58,"(1.8, 1.7)",285.0,2024-07-31
48
+ gpt-3.5-turbo-0314,18.05,16.57,20.06,"(1.5, 2.0)",334.0,2024-07-31
49
+ gemini-pro,17.8,15.96,19.32,"(1.8, 1.5)",322.0,2024-07-31
50
+ snowflake-arctic-instruct,17.61,16.12,19.27,"(1.5, 1.7)",365.0,2024-07-31
51
+ command-r,17.02,15.73,18.51,"(1.3, 1.5)",432.0,2024-07-31
52
+ phi-3-mini-128k-instruct,15.43,13.94,17.02,"(1.5, 1.6)",609.0,2024-07-31
53
+ tulu-2-dpo-70b,14.99,13.05,16.82,"(1.9, 1.8)",550.0,2024-07-31
54
+ starling-lm-7b-alpha,12.8,11.23,14.5,"(1.6, 1.7)",483.0,2024-07-31
55
+ mistral-7b-instruct,12.57,11.05,14.11,"(1.5, 1.5)",541.0,2024-07-31
56
+ gemma-1.1-7b-it,12.09,10.61,13.43,"(1.5, 1.3)",341.0,2024-07-31
57
+ llama-2-70b-chat,11.55,10.02,13.01,"(1.5, 1.5)",595.0,2024-07-31
58
+ vicuna-33b,8.63,7.59,9.84,"(1.0, 1.2)",451.0,2024-07-31
59
+ gemma-7b-it,7.47,6.5,8.6,"(1.0, 1.1)",378.0,2024-07-31
60
+ gemma-1.1-2b-it,3.37,2.74,4.14,"(0.6, 0.8)",316.0,2024-07-31
61
+ gemma-2b-it,3.0,2.33,3.67,"(0.7, 0.7)",369.0,2024-07-31
leaderboard_table_20240731.csv CHANGED
@@ -139,3 +139,6 @@ llama-3.1-8b-instruct,Meta-Llama-3.1-8b-Instruct,-,0.730,2023/12,Llama 3.1 Commu
139
  athene-70b-0725,Athene-70b,-,-,2024/7,CC-BY-NC-4.0,NexusFlow,https://huggingface.co/Nexusflow/Athene-70B
140
  internvl2-26b,InternVL2-26b,-,-,2024/7,MIT,OpenGVLab,https://internvl.github.io/blog/2024-07-02-InternVL-2.0/
141
  gemma-2-2b-it,Gemma-2-2B-it,-,0.513,2024/7,Gemma license,Google,https://ai.google.dev/gemma#introducing-gemma-2
 
 
 
 
139
  athene-70b-0725,Athene-70b,-,-,2024/7,CC-BY-NC-4.0,NexusFlow,https://huggingface.co/Nexusflow/Athene-70B
140
  internvl2-26b,InternVL2-26b,-,-,2024/7,MIT,OpenGVLab,https://internvl.github.io/blog/2024-07-02-InternVL-2.0/
141
  gemma-2-2b-it,Gemma-2-2B-it,-,0.513,2024/7,Gemma license,Google,https://ai.google.dev/gemma#introducing-gemma-2
142
+ glm-4-air,GLM-4-AIR,-,-,Unknown,Proprietary,Zhipu AI,https://open.bigmodel.cn/
143
+ snorkel-mistral-pairrm-dpo,Snorkel-Mistral-PairRM-DPO,-,-,2024/5,Apache 2.0,Snorkel AI,https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO
144
+ mistral-large-2407,Mistral-Large-2407,-,-,2024/7,Proprietary,Mistral,https://mistral.ai/news/mistral-large-2407/