andrewrreed (HF staff) committed
Commit 167137b
1 Parent(s): 311dc3a

Add filters

Files changed (4)
  1. app.py +157 -101
  2. release_date_mapping.json +1 -1
  3. requirements.txt +2 -1
  4. utils.py +60 -0
app.py CHANGED
@@ -2,113 +2,169 @@ import pickle
 
 import pandas as pd
 import gradio as gr
-from huggingface_hub import HfFileSystem, hf_hub_download
-
-if gr.NO_RELOAD:
-    ###################
-    ### Load Data
-    ###################
-
-    key_to_category_name = {
-        "full": "Overall",
-        "coding": "Coding",
-        "long_user": "Longer Query",
-        "english": "English",
-        "chinese": "Chinese",
-        "french": "French",
-        "no_tie": "Exclude Ties",
-        "no_short": "Exclude Short Query (< 5 tokens)",
-        "no_refusal": "Exclude Refusal",
-    }
-    cat_name_to_explanation = {
-        "Overall": "Overall Questions",
-        "Coding": "Coding: whether conversation contains code snippets",
-        "Longer Query": "Longer Query (>= 500 tokens)",
-        "English": "English Prompts",
-        "Chinese": "Chinese Prompts",
-        "French": "French Prompts",
-        "Exclude Ties": "Exclude Ties and Bothbad",
-        "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
-        "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
-    }
-
-    fs = HfFileSystem()
-
-    def extract_date(filename):
-        return filename.split("/")[-1].split(".")[0].split("_")[-1]
-
-    # gather ELO data
-    ELO_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.pkl"
-    elo_files = fs.glob(ELO_DATA_FILES)
-    latest_elo_file = sorted(elo_files, key=extract_date, reverse=True)[0]
-
-    latest_elo_file_local = hf_hub_download(
-        repo_id="lmsys/chatbot-arena-leaderboard",
-        filename=latest_elo_file.split("/")[-1],
-        repo_type="space",
+import plotly.express as px
+
+from utils import (
+    KEY_TO_CATEGORY_NAME,
+    PROPRIETARY_LICENSES,
+    download_latest_data_from_space,
+)
+
+# with gr.NO_RELOAD:
+###################
+### Load Data
+###################
+
+# gather ELO data
+latest_elo_file_local = download_latest_data_from_space(
+    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
+)
+
+with open(latest_elo_file_local, "rb") as fin:
+    elo_results = pickle.load(fin)
+
+arena_dfs = {}
+for k in KEY_TO_CATEGORY_NAME.keys():
+    if k not in elo_results:
+        continue
+    arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"]
+
+# gather open llm leaderboard data
+latest_leaderboard_file_local = download_latest_data_from_space(
+    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
+)
+leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
+
+###################
+### Prepare Data
+###################
+
+# merge leaderboard data with ELO data
+merged_dfs = {}
+for k, v in arena_dfs.items():
+    merged_dfs[k] = (
+        pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
+        .sort_values("rating", ascending=False)
+        .reset_index(drop=True)
     )
 
-    with open(latest_elo_file_local, "rb") as fin:
-        elo_results = pickle.load(fin)
-
-    arena_dfs = {}
-    for k in key_to_category_name.keys():
-        if k not in elo_results:
-            continue
-        arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
-
-    # gather open llm leaderboard data
-    LEADERBOARD_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.csv"
-    leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)
-    latest_leaderboard_file = sorted(leaderboard_files, key=extract_date, reverse=True)[
-        0
-    ]
-
-    latest_leaderboard_file_local = hf_hub_download(
-        repo_id="lmsys/chatbot-arena-leaderboard",
-        filename=latest_leaderboard_file.split("/")[-1],
-        repo_type="space",
+# add release dates into the merged data
+release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
+for k, v in merged_dfs.items():
+    merged_dfs[k] = pd.merge(
+        merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
     )
-    leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
-
-    ###################
-    ### Prepare Data
-    ###################
-
-    # merge leaderboard data with ELO data
-    merged_dfs = {}
-    for k, v in arena_dfs.items():
-        merged_dfs[k] = (
-            pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
-            .sort_values("rating", ascending=False)
-            .reset_index(drop=True)
-        )
 
-    # add release dates into the merged data
-    release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
-    for k, v in merged_dfs.items():
-        merged_dfs[k] = pd.merge(
-            merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
-        )
 df = merged_dfs["Overall"]
-y_min = df["rating"].min()
-y_max = df["rating"].max()
-y_buffer = (y_max - y_min) * 0.1
+df["License"] = df["License"].apply(
+    lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
+)
+df["Release Date"] = pd.to_datetime(df["Release Date"])
+df["Month-Year"] = df["Release Date"].dt.to_period("M")
+df["rating"] = df["rating"].round()
+
+
+###################
+### Plot Data
+###################
+
+date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
+min_elo_score = df["rating"].min().round()
+max_elo_score = df["rating"].max().round()
+upper_models_per_month = int(
+    df.groupby(["Month-Year", "License"])["rating"].apply(lambda x: x.count()).max()
+)
+
+
+def build_plot(min_score, max_models_per_month, toggle_annotations):
+
+    filtered_df = df[(df["rating"] >= min_score)]
+    filtered_df = (
+        filtered_df.groupby(["Month-Year", "License"])
+        .apply(lambda x: x.nlargest(max_models_per_month, "rating"))
+        .reset_index(drop=True)
+    )
+
+    fig = px.scatter(
+        filtered_df,
+        x="Release Date",
+        y="rating",
+        color="License",
+        hover_name="Model",
+        hover_data=["Organization", "License"],
+        trendline="ols",
+        title=f"Proprietary vs Open LLMs (LMSYS Arena ELO as of {date_updated})",
+        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
+        height=700,
+        template="seaborn",
+    )
+
+    fig.update_traces(marker=dict(size=10, opacity=0.6))
 
-with gr.Blocks() as demo:
-    gr.Markdown("# Chatbot Arena Leaderboard")
+    if toggle_annotations:
+        # get the points to annotate (only the highest rated model per month per license)
+        idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
+            "rating"
+        ].idxmax()
+        points_to_annotate_df = filtered_df.loc[idx_to_annotate]
+
+        for i, row in points_to_annotate_df.iterrows():
+            fig.add_annotation(
+                x=row["Release Date"],
+                y=row["rating"],
+                text=row["Model"],
+                showarrow=True,
+                arrowhead=0,
+            )
+
+    return fig
+
+
+demo = gr.Blocks()
+
+with demo:
+    gr.Markdown("# Proprietary vs Open LLMs (LMSYS Arena ELO)")
     with gr.Row():
-        gr.ScatterPlot(
-            df,
-            title="hello",
-            x="Release Date",
-            y="rating",
-            tooltip=["Model", "rating", "num_battles", "Organization", "License"],
-            width=1000,
-            height=700,
-            x_label_angle=-45,
-            y_lim=[y_min - y_buffer, y_max + y_buffer],
+        min_score = gr.Slider(
+            minimum=min_elo_score,
+            maximum=max_elo_score,
+            value=800,
+            step=50,
+            label="Minimum ELO Score",
+        )
+        max_models_per_month = gr.Slider(
+            value=upper_models_per_month,
+            minimum=1,
+            maximum=upper_models_per_month,
+            step=1,
+            label="Max Models per Month (per License)",
         )
+        toggle_annotations = gr.Radio(
+            choices=[True, False], label="Overlay Best Model Name", value=False
+        )
+
+    # Show plot
+    plot = gr.Plot()
+    demo.load(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+    min_score.change(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+    max_models_per_month.change(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+    toggle_annotations.change(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
+# if __name__ == "__main__":
release_date_mapping.json CHANGED
@@ -7,7 +7,7 @@
   {
     "key": "gpt-4-1106-preview",
    "Model": "GPT-4-1106-preview",
-    "Release Date": "2024-11-06"
+    "Release Date": "2023-11-06"
   },
   {
     "key": "claude-3-opus-20240229",
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 huggingface_hub
 pandas
 plotly
-gradio
+gradio
+statsmodels
utils.py ADDED
@@ -0,0 +1,60 @@
+from typing import Literal
+
+from huggingface_hub import HfFileSystem, hf_hub_download
+
+KEY_TO_CATEGORY_NAME = {
+    "full": "Overall",
+    "coding": "Coding",
+    "long_user": "Longer Query",
+    "english": "English",
+    "chinese": "Chinese",
+    "french": "French",
+    "no_tie": "Exclude Ties",
+    "no_short": "Exclude Short Query (< 5 tokens)",
+    "no_refusal": "Exclude Refusal",
+}
+CAT_NAME_TO_EXPLANATION = {
+    "Overall": "Overall Questions",
+    "Coding": "Coding: whether conversation contains code snippets",
+    "Longer Query": "Longer Query (>= 500 tokens)",
+    "English": "English Prompts",
+    "Chinese": "Chinese Prompts",
+    "French": "French Prompts",
+    "Exclude Ties": "Exclude Ties and Bothbad",
+    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
+    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
+}
+
+PROPRIETARY_LICENSES = [
+    "Proprietary",
+]
+
+
+def download_latest_data_from_space(
+    repo_id: str, file_type: Literal["pkl", "csv"]
+) -> str:
+    """
+    Downloads the latest data file of the specified file type from the given repository space.
+
+    Args:
+        repo_id (str): The ID of the repository space.
+        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".
+
+    Returns:
+        str: The local file path of the downloaded data file.
+    """
+
+    def extract_date(filename):
+        return filename.split("/")[-1].split(".")[0].split("_")[-1]
+
+    fs = HfFileSystem()
+    data_file_path = f"spaces/{repo_id}/*.{file_type}"
+    files = fs.glob(data_file_path)
+    latest_file = sorted(files, key=extract_date, reverse=True)[0]
+
+    latest_filepath_local = hf_hub_download(
+        repo_id=repo_id,
+        filename=latest_file.split("/")[-1],
+        repo_type="space",
+    )
+    return latest_filepath_local