Spaces:

andrewrreed
/

closed-vs-open-arena-elo

Running on CPU Upgrade

App Files Files Community

closed-vs-open-arena-elo / app.py

andrewrreed HF staff

cleanup

05f8ae5 7 months ago

raw

history blame

6.99 kB

	import pickle

	import pandas as pd
	import gradio as gr
	import plotly.express as px

	from utils import (
	KEY_TO_CATEGORY_NAME,
	PROPRIETARY_LICENSES,
	CAT_NAME_TO_EXPLANATION,
	download_latest_data_from_space,
	get_constants,
	)

	###################
	### Load Data
	###################

	# gather ELO data: fetch the latest pickle published in the arena leaderboard repo
	# (the helper's return value is used as a local file path below)
	latest_elo_file_local = download_latest_data_from_space(
	    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
	)

	# SECURITY NOTE(review): unpickling can execute arbitrary code. This file is
	# downloaded from a remote repository, so it is only as trustworthy as that
	# repository; do not point this at a source you do not control or trust.
	with open(latest_elo_file_local, "rb") as fin:
	    elo_results = pickle.load(fin)

	# map each category's display name to its leaderboard table, skipping any
	# category key that is absent from this ELO snapshot
	arena_dfs = {
	    category_name: elo_results[key]["leaderboard_table_df"]
	    for key, category_name in KEY_TO_CATEGORY_NAME.items()
	    if key in elo_results
	}

	# gather the leaderboard CSV from the same repo; it is joined to the ELO tables
	# on its "key" column further down
	latest_leaderboard_file_local = download_latest_data_from_space(
	    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
	)
	leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

	###################
	### Prepare Data
	###################

	# merge leaderboard data with ELO data: the ELO table's index is matched against
	# the leaderboard's "key" column, then rows are ordered best-rated first.
	# pd.merge defaults to an inner join, so models missing from either side are dropped.
	merged_dfs = {
	    category: (
	        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
	        .sort_values("rating", ascending=False)
	        .reset_index(drop=True)
	    )
	    for category, arena_df in arena_dfs.items()
	}

	# add release dates into the merged data (again an inner join: models without an
	# entry in release_date_mapping.json do not survive this step)
	release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
	release_dates = release_date_mapping[["key", "Release Date"]]
	merged_dfs = {
	    category: pd.merge(category_df, release_dates, on="key")
	    for category, category_df in merged_dfs.items()
	}


	# format dataframes
	def format_data(df):
	    """Return a plot-ready copy of a merged leaderboard frame.

	    The input frame is left unmodified; all changes are applied to a copy.

	    Args:
	        df: DataFrame with at least the columns "License", "Release Date"
	            and "rating".

	    Returns:
	        A new DataFrame with a fresh positional index in which:
	        - "License" is collapsed to "Proprietary LLM" or "Open LLM",
	        - "Release Date" is converted to datetime,
	        - "Month-Year" holds the monthly period of the release date,
	        - "rating" is rounded to the nearest whole number.
	    """
	    # copy first so the caller's frame is not mutated as a side effect
	    formatted = df.copy()
	    formatted["License"] = formatted["License"].apply(
	        lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
	    )
	    formatted["Release Date"] = pd.to_datetime(formatted["Release Date"])
	    # monthly bucket used later to group models released in the same month
	    formatted["Month-Year"] = formatted["Release Date"].dt.to_period("M")
	    formatted["rating"] = formatted["rating"].round()
	    return formatted.reset_index(drop=True)


	# run every category's merged frame through the formatter
	merged_dfs = {
	    category: format_data(category_df)
	    for category, category_df in merged_dfs.items()
	}


	# values used to configure the UI sliders
	(
	    min_elo_score,
	    max_elo_score,
	    upper_models_per_month,
	) = get_constants(merged_dfs)

	# keep only the part of the timestamp before the first space
	_last_updated_stamp = elo_results["full"]["last_updated_datetime"]
	date_updated = _last_updated_stamp.split(" ")[0]


	###################
	### Plot Data
	###################


	def get_data_split(dfs, set_name):
	    """Return an independent, re-indexed copy of one category's frame.

	    Args:
	        dfs: Mapping from category name to DataFrame.
	        set_name: Key of the frame to fetch from ``dfs``.

	    Returns:
	        A deep copy of ``dfs[set_name]`` with a fresh positional index.
	    """
	    selected = dfs[set_name]
	    # deep copy so later edits never touch the frame stored in the mapping
	    snapshot = selected.copy(deep=True)
	    snapshot = snapshot.reset_index(drop=True)
	    return snapshot


	def build_plot(min_score, max_models_per_month, toggle_annotations, set_selector):
	    """Build the rating-vs-release-date scatter plot for one category.

	    Args:
	        min_score: Lowest rating a model may have and still be plotted.
	        max_models_per_month: Cap on plotted models per (month, license) group.
	        toggle_annotations: When truthy, label the top-rated model of each
	            (month, license) group on the figure.
	        set_selector: Key into ``merged_dfs`` selecting the category to plot.

	    Returns:
	        The figure created by ``px.scatter``, with annotations if requested.
	    """
	    group_keys = ["Month-Year", "License"]

	    def _top_rated(group):
	        # best-rated rows of a single (month, license) group
	        return group.nlargest(max_models_per_month, "rating")

	    category_df = get_data_split(merged_dfs, set_name=set_selector)

	    # drop models under the rating threshold, then thin out each group
	    above_threshold = category_df[category_df["rating"] >= min_score]
	    plot_df = above_threshold.groupby(group_keys).apply(_top_rated)
	    plot_df = plot_df.reset_index(drop=True)

	    fig = px.scatter(
	        plot_df,
	        x="Release Date",
	        y="rating",
	        color="License",
	        hover_name="Model",
	        hover_data=["Organization", "License", "Link"],
	        trendline="ols",
	        title=f"Open vs Proprietary LLMs by LMSYS Arena ELO Score (as of {date_updated})",
	        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
	        height=800,
	        template="seaborn",
	    )
	    fig.update_traces(marker={"size": 10, "opacity": 0.6})

	    if not toggle_annotations:
	        return fig

	    # annotate only the highest rated model per month per license
	    best_row_labels = plot_df.groupby(group_keys)["rating"].idxmax()
	    for _, best in plot_df.loc[best_row_labels].iterrows():
	        fig.add_annotation(
	            x=best["Release Date"],
	            y=best["rating"],
	            text=best["Model"],
	            showarrow=True,
	            arrowhead=0,
	        )

	    return fig


	# Assemble the Gradio UI: intro text, four controls, and the plot they drive.
	with gr.Blocks(
	    theme=gr.themes.Soft(
	        primary_hue=gr.themes.colors.sky,
	        secondary_hue=gr.themes.colors.green,
	        font=[
	            gr.themes.GoogleFont("Open Sans"),
	            "ui-sans-serif",
	            "system-ui",
	            "sans-serif",
	        ],
	    )
	) as demo:
	    gr.Markdown(
	        """
	<div style="text-align: center; max-width: 650px; margin: auto;">
	<h1 style="font-weight: 900; margin-top: 5px;">🔬 Progress Tracker: Open vs. Proprietary LLMs
	</h1>
	<p style="text-align: left; margin-top: 10px; margin-bottom: 10px; line-height: 20px;">
	This app visualizes the progress of proprietary and open-source LLMs in the LMSYS Arena ELO leaderboard. The idea is inspired by <a href="https://www.linkedin.com/posts/maxime-labonne_arena-elo-graph-updated-with-new-models-activity-7187062633735368705-u2jB?utm_source=share&utm_medium=member_desktop">this great work</a> from <a href="https://huggingface.co/mlabonne/">Maxime Labonne</a>.
	</p>
	</div>
	"""
	    )

	    with gr.Row():
	        with gr.Column():
	            set_selector = gr.Dropdown(
	                choices=list(CAT_NAME_TO_EXPLANATION.keys()),
	                label="Select Category",
	                value="Overall",
	                info="Select the category to visualize",
	            )
	            toggle_annotations = gr.Radio(
	                choices=[True, False],
	                label="Overlay Best Model Name",
	                value=True,
	                info="Toggle to overlay the name of the best model per month per license",
	            )
	        with gr.Column():
	            min_score = gr.Slider(
	                minimum=min_elo_score,
	                maximum=max_elo_score,
	                # start 30% of the way up the available rating range
	                value=(max_elo_score - min_elo_score) * 0.3 + min_elo_score,
	                step=50,
	                label="Minimum ELO Score",
	                info="Filter out low scoring models",
	            )
	            max_models_per_month = gr.Slider(
	                # clamp so the default never drops below the slider minimum
	                # when upper_models_per_month is 2 or less
	                value=max(1, upper_models_per_month - 2),
	                minimum=1,
	                maximum=upper_models_per_month,
	                step=1,
	                label="Max Models per Month (per License)",
	                info="Limit to N best models per month per license to reduce clutter",
	            )

	    # Show plot
	    plot = gr.Plot()

	    # Order matches build_plot's positional parameters.
	    plot_inputs = [min_score, max_models_per_month, toggle_annotations, set_selector]

	    # draw once on page load, then redraw whenever any control changes
	    demo.load(fn=build_plot, inputs=plot_inputs, outputs=plot)
	    for control in plot_inputs:
	        control.change(fn=build_plot, inputs=plot_inputs, outputs=plot)

	demo.launch()