open_dutch_llm_leaderboard

Running

App Files Files Community

open_dutch_llm_leaderboard / app.py

BramVanroy

Update app.py

399342f verified 10 months ago

raw

history blame contribute delete

13.2 kB

	import json
	from dataclasses import dataclass, field, fields
	from functools import cached_property
	from pathlib import Path
	from typing import Literal

	import gradio as gr
	import numpy as np
	import pandas as pd
	import plotly.graph_objects as go
	from pandas import DataFrame
	from pandas.io.formats.style import Styler

	from content import *

	# Metric key that is read from each task's results. The task names used as keys here must also exist as
	# fields on the Result class, and their order is the order in which the metrics legend lists them.
	TASK_METRICS = {
	"arc": "acc_norm",
	"hellaswag": "acc_norm",
	"mmlu": "acc_norm",
	"truthfulqa": "mc2",
	}

	# Emoji shown for each model type in the styled leaderboard table and in the model-type legend.
	MODEL_TYPE_EMOJIS = {
	"pretrained": "🟢",
	"fine-tuned": "🔶",
	"instruction-tuned": "⭕",
	"RL-tuned": "🟦",
	}

	# Symbol displayed when a model type has no emoji or when the Dutch coverage is "not-given".
	NOT_GIVEN_SYMBOL = "❔"


	@dataclass
	class Result:
	    """
	    Scores and metadata of a single model on the leaderboard

	    ``average`` and ``num_parameters_kmb`` are derived in ``__post_init__`` and are not accepted by the
	    constructor. Task scores that are not given default to ``np.nan``.
	    """

	    model_name: str
	    short_name: str
	    # "not-given" is accepted by the validation below, so it is part of the declared types as well
	    model_type: Literal["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]
	    dutch_coverage: Literal["none", "pretrained", "fine-tuned", "not-given"]
	    num_parameters: int
	    arc: float = field(default=np.nan)
	    average: float = field(default=np.nan, init=False)
	    hellaswag: float = field(default=np.nan)
	    mmlu: float = field(default=np.nan)
	    truthfulqa: float = field(default=np.nan)
	    num_parameters_kmb: str = field(init=False)

	    def __post_init__(self):
	        """
	        Validates the model metadata and fills in the derived fields
	        :raises ValueError: if ``model_type`` or ``dutch_coverage`` has an unsupported value, or if a task in
	        TASK_METRICS has no corresponding field on this class
	        """
	        if self.model_type not in ("pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"):
	            raise ValueError(
	                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned',"
	                f" 'instruction-tuned', 'RL-tuned', 'not-given'"
	            )
	        if self.dutch_coverage not in ("none", "pretrained", "fine-tuned", "not-given"):
	            raise ValueError(
	                f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given'"
	            )

	        field_names = {f.name for f in fields(self)}
	        for task_name in TASK_METRICS:
	            if task_name not in field_names:
	                raise ValueError(f"Task name {task_name} not found in Result class fields so cannot create DataFrame")

	        scores = [getattr(self, task_name) for task_name in TASK_METRICS]
	        if not scores or any(np.isnan(score) for score in scores):
	            # The average is only meaningful once every task has been benchmarked
	            self.average = np.nan
	        else:
	            # Divide by the actual number of tasks rather than a hard-coded 4, so that the average stays
	            # correct when TASK_METRICS gains or loses a task
	            self.average = sum(scores) / len(scores)
	        self.num_parameters_kmb = convert_number_to_kmb(self.num_parameters)


	@dataclass
	class ResultSet:
	    """
	    A collection of Result objects with DataFrame, styled-table and plot views

	    ``column_names`` maps Result field names to the column labels that are displayed; its order is the
	    order of the columns in the DataFrame. ``column_types`` maps those labels to the datatype that is
	    passed to the Gradio dataframe. When ``column_names`` is empty, both mappings are replaced by the
	    built-in defaults.
	    """

	    results: list[Result]
	    column_names: dict[str, str] = field(default_factory=dict)
	    column_types: dict[str, str] = field(default_factory=dict)

	    def __post_init__(self):
	        """
	        Fills in the default columns when none are given and validates the column configuration
	        :raises ValueError: if a key of ``column_types`` is not a label in ``column_names``, if
	        ``column_names`` has no "average" entry, or if a key of ``column_names`` is not a Result field
	        """
	        if not self.column_names:
	            # Order will be the order of the columns in the DataFrame
	            self.column_names = {
	                "short_name": "Model",
	                "model_type": "T",
	                "dutch_coverage": "🇳🇱",
	                "num_parameters": "Size",
	                "average": "Avg.",
	                "arc": "ARC (25-shot)",
	                "hellaswag": "HellaSwag (10-shot)",
	                "mmlu": "MMLU (5-shot)",
	                "truthfulqa": "TruthfulQA (0-shot)",
	            }
	            self.column_types = {
	                "Model": "markdown",
	                "T": "str",
	                "🇳🇱": "str",
	                "Size": "str",
	                "Avg.": "number",
	                "ARC (25-shot)": "number",
	                "HellaSwag (10-shot)": "number",
	                "MMLU (5-shot)": "number",
	                "TruthfulQA (0-shot)": "number",
	            }

	        # Build the lookup set once instead of once per checked column
	        display_names = set(self.column_names.values())
	        for column_type in self.column_types:
	            if column_type not in display_names:
	                raise ValueError(
	                    f"Column names specified in column_types must be values in column_names."
	                    f" {column_type} not found."
	                )

	        if "average" not in self.column_names:
	            raise ValueError("Column names must contain 'average' column name")

	        field_names = {f.name for f in fields(Result)}
	        for column_name in self.column_names:
	            if column_name not in field_names:
	                raise ValueError(f"Column name {column_name} not found in Result class so cannot create DataFrame")

	    @cached_property
	    def df(self) -> DataFrame:
	        """
	        The raw results as a DataFrame with the display labels as columns
	        :return: a DataFrame with one row per result, sorted by the average column in descending order
	        """
	        data = [
	            {col_name: getattr(result, attr) for attr, col_name in self.column_names.items()}
	            for result in self.results
	        ]

	        df = pd.DataFrame(data)
	        df = df.sort_values(by=self.column_names["average"], ascending=False)
	        return df

	    @staticmethod
	    def _render_cell(result: Result, attr: str):
	        """
	        Returns the value that the styled leaderboard shows for one attribute of a result
	        :param result: the result to render
	        :param attr: name of the Result field to render
	        :return: an HTML link for the short name, an emoji for the model type, the Dutch coverage (or the
	        not-given symbol), and the unchanged attribute value for every other field
	        """
	        if attr == "short_name":
	            return (
	                f"<a target='_blank' href='https://huggingface.co/{result.model_name}'"
	                f" style='color: var(--link-text-color); text-decoration: underline;text-decoration-style:"
	                f" dotted;'>{result.short_name}</a>"
	            )
	        if attr == "model_type":
	            return MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
	        if attr == "dutch_coverage":
	            return result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL
	        return getattr(result, attr)

	    def _style(self, df: DataFrame, number_format: str) -> Styler:
	        """
	        Applies the shared leaderboard styling to a DataFrame
	        :param df: the DataFrame to style, with the display labels as columns
	        :param number_format: format string for the task scores and the average, e.g. "{:.4f}"
	        :return: a Styler with formatted scores, the best score per column in bold, abbreviated model
	        sizes, a caption and a hidden index
	        """
	        number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
	        styler = df.style.format(number_format, subset=number_cols, na_rep="<missing>")

	        def highlight_max(col):
	            # nanmax ignores missing scores so that they do not prevent highlighting the best known score
	            return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

	        styler = styler.apply(highlight_max, axis=0, subset=number_cols)
	        num_params_col = self.column_names["num_parameters"]
	        styler = styler.format(convert_number_to_kmb, subset=num_params_col)
	        styler.set_caption("Leaderboard on Dutch benchmarks.")
	        return styler.hide()

	    @cached_property
	    def styled_df(self) -> Styler:
	        """
	        The leaderboard as shown in the app: linked model names, emoji model types and four-decimal scores
	        :return: a Styler over the results, sorted by the average column in descending order
	        """
	        data = [
	            {col_name: self._render_cell(result, attr) for attr, col_name in self.column_names.items()}
	            for result in self.results
	        ]

	        df = pd.DataFrame(data)
	        df = df.sort_values(by=self.column_names["average"], ascending=False)
	        return self._style(df, "{:.4f}")

	    @cached_property
	    def latex_df(self) -> Styler:
	        """
	        The leaderboard styled for LaTeX export: raw attribute values and two-decimal scores
	        :return: a Styler over ``self.df``
	        """
	        return self._style(self.df, "{:.2f}")

	    @cached_property
	    def viz_checkboxes(self):
	        """
	        The checkbox group to select the models that are plotted
	        :return: a gr.CheckboxGroup listing all models, with the three best-scoring models preselected
	        """
	        # self.df is already sorted by descending average, so the first three rows are the top three
	        model_names = self.df[self.column_names["short_name"]].tolist()
	        return gr.CheckboxGroup(model_names, label="Models", value=model_names[:3])

	    def plot(self, model_names: list[str]):
	        """
	        Creates a radar chart with the task scores of the selected models
	        :param model_names: short names of the models to plot
	        :return: a plotly Figure with one closed trace per model that has results, or None when no model
	        is selected
	        """
	        if not model_names:
	            return None

	        # Use the configured label instead of a hard-coded "Model" so that custom column names work too
	        model_col = self.column_names["short_name"]

	        # Only get task columns and model name
	        task_columns = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "short_name"]
	        df = self.df[task_columns]

	        # Rename the task columns to the task names, the model column keeps its label
	        task_names = {col: attr for attr, col in self.column_names.items() if attr in TASK_METRICS}
	        df = df.rename(columns=task_names)

	        # Only keep the selected models
	        df = df[df[model_col].isin(model_names)]

	        # Melt the dataframe to long format
	        df = df.melt(id_vars=[model_col], var_name="Task", value_name="Score").sort_values(by="Task")

	        # Populate figure
	        fig = go.Figure()
	        for model_name in model_names:
	            model_df = df[df[model_col] == model_name]
	            if model_df.empty:
	                # A selected name without results has nothing to draw; skipping avoids an IndexError below
	                continue

	            scores = model_df["Score"].tolist()
	            tasks = model_df["Task"].tolist()

	            # Repeat the first point at the end to close the lines
	            # Cf. https://community.plotly.com/t/closing-line-for-radar-cart-and-popup-window-on-chart-radar/47711/4
	            scores.append(scores[0])
	            tasks.append(tasks[0])

	            fig.add_trace(go.Scatterpolar(r=scores, theta=tasks, name=model_name))

	        fig.update_layout(
	            title="Model performance on Dutch benchmarks",
	        )

	        return fig


	def convert_number_to_kmb(number: int) -> str:
	    """
	    Converts a number to a string with K, M or B suffix
	    :param number: the number to convert
	    :return: a string with the number rounded to one decimal and a suffix, e.g. "7.0B" or "125.0M". Numbers
	    below 1000 are returned as-is without a suffix
	    """
	    units = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))
	    for position, (divisor, suffix) in enumerate(units):
	        if number < divisor:
	            continue

	        scaled = round(number / divisor, 1)
	        if scaled >= 1_000 and position > 0:
	            # Rounding pushed the value over the unit boundary (e.g. 999_999_999 -> "1000.0M"), so express
	            # it in the next larger unit instead ("1.0B")
	            divisor, suffix = units[position - 1]
	            scaled = round(number / divisor, 1)
	        return f"{scaled}{suffix}"

	    return str(number)


	def collect_results() -> ResultSet:
	    """
	    Collects the benchmark results from the evals folder next to this file
	    :return: a ResultSet with one Result per model for which at least one results file was found
	    :raises ValueError: if the overview file models.json does not exist in the evals folder
	    :raises KeyError: if a results file belongs to a model that is not in the overview file, or if it
	    contains a task that is not in TASK_METRICS
	    """
	    evals_dir = Path(__file__).parent.joinpath("evals")
	    pf_overview = evals_dir.joinpath("models.json")
	    if not pf_overview.exists():
	        raise ValueError(
	            f"Overview file {pf_overview} not found. Make sure to generate it first with `generate_overview_json.py`."
	        )

	    model_info = json.loads(pf_overview.read_text(encoding="utf-8"))
	    results_per_model = {}
	    # rglob does not guarantee an order. Sorting makes the outcome reproducible, also when several files
	    # contain a score for the same model and task (the last file wins)
	    for pfin in sorted(evals_dir.rglob("*.json")):
	        data = json.loads(pfin.read_text(encoding="utf-8"))

	        # JSON files without a "results" key are not results files
	        if "results" not in data:
	            continue

	        task_results = data["results"]
	        # The short name of the model is everything after the second underscore in the file name
	        short_name = pfin.stem.split("_", 2)[2].lower()

	        if short_name not in model_info:
	            raise KeyError(
	                f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
	                f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
	            )

	        if short_name not in results_per_model:
	            info = model_info[short_name]
	            results_per_model[short_name] = {
	                "short_name": short_name,
	                "model_name": info["model_name"],
	                "model_type": info["model_type"],
	                "dutch_coverage": info["dutch_coverage"],
	                "num_parameters": info["num_parameters"],
	            }

	        for task_key, task_result in task_results.items():
	            # Drop the part after the last underscore (presumably a language suffix such as "_nl") to get
	            # the task name that is used in TASK_METRICS
	            task_name = task_key.rsplit("_", 1)[0]
	            metric = TASK_METRICS[task_name]
	            results_per_model[short_name][task_name] = task_result[metric]

	    return ResultSet([Result(**res) for res in results_per_model.values()])


	# Build the Gradio app: intro, leaderboard table, legends, LaTeX export and the radar chart
	with gr.Blocks() as demo:
	    gr.HTML(TITLE)
	    gr.Markdown(INTRO_TEXT)

	    gr.Markdown(
	        "## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
	        " All models have been benchmarked in 8-bit. `<missing>` values indicate that those benchmarks are still"
	        " pending."
	    )

	    results = collect_results()

	    # Headers and datatypes are both derived from the DataFrame columns so that their order matches
	    gr.components.Dataframe(
	        results.styled_df,
	        headers=list(results.df.columns),
	        datatype=[results.column_types[col] for col in results.df.columns],
	        interactive=False,
	        elem_id="leaderboard-table",
	    )

	    # Three legends next to each other: model types, language coverage and reported metrics
	    with gr.Row():
	        with gr.Column():
	            modeltypes_str = "<br>".join(f"- {emoji}: {modeltype}" for modeltype, emoji in MODEL_TYPE_EMOJIS.items())
	            gr.Markdown(f"Model types:<br>{modeltypes_str}")

	        with gr.Column():
	            gr.Markdown(
	                f"Language coverage ({results.column_names['dutch_coverage']}):"
	                "<br>- `none`: no explicit/deliberate Dutch coverage,"
	                "<br>- `pretrained`: pretrained on Dutch data,"
	                "<br>- `fine-tuned`: fine-tuned on Dutch data"
	            )

	        with gr.Column():
	            metrics_str = "<br>".join(f"- {task}: `{metric}`" for task, metric in TASK_METRICS.items())
	            gr.Markdown(f"Reported metrics:<br>{metrics_str}")

	    gr.Markdown("## LaTeX")
	    gr.Code(results.latex_df.to_latex(convert_css=True))

	    gr.Markdown("## Visualization")
	    with gr.Row():
	        with gr.Column():
	            buttons = results.viz_checkboxes

	        with gr.Column(scale=2):
	            plot = gr.Plot(container=True)
	            # Redraw the chart when the selection changes, and draw it once when the app is loaded
	            buttons.change(results.plot, inputs=buttons, outputs=[plot])
	            demo.load(results.plot, inputs=buttons, outputs=[plot])

	    gr.Markdown(DISCLAIMER, elem_classes="markdown-text")
	    gr.Markdown(CREDIT, elem_classes="markdown-text")
	    gr.Markdown(CITATION, elem_classes="markdown-text")


	# Only start the server when this file is run as a script
	if __name__ == "__main__":
	    demo.launch()