Gregor Betz
committed on
Commit
•
13e8963
1
Parent(s):
a1d2608
initial code upload
Browse files- .gitignore +1 -0
- README.md +1 -1
- app.py +109 -0
- backend/data.py +139 -0
- backend/envs.py +19 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.mypy_cache
|
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: Open Cot Dashboard
|
3 |
-
emoji:
|
4 |
colorFrom: yellow
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
|
|
1 |
---
|
2 |
title: Open Cot Dashboard
|
3 |
+
emoji: 📊
|
4 |
colorFrom: yellow
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
app.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr # type: ignore
|
2 |
+
import plotly.express as px # type: ignore
|
3 |
+
|
4 |
+
from backend.data import load_cot_data
|
5 |
+
from backend.envs import API, REPO_ID, TOKEN
|
6 |
+
|
# Logo assets rendered in the dashboard header.
logo1_url = "https://raw.githubusercontent.com/logikon-ai/cot-eval/main/assets/AI2_Logo_Square.png"
logo2_url = "https://raw.githubusercontent.com/logikon-ai/cot-eval/main/assets/logo_logikon_notext_withborder.png"
# HTML snippet placing both logos side by side, centered.
LOGOS = f'<div style="display: flex; justify-content: center;"><a href="https://allenai.org/"><img src="{logo1_url}" alt="AI2" style="width: 30vw; min-width: 20px; max-width: 60px;"></a> <a href="https://logikon.ai"><img src="{logo2_url}" alt="Logikon AI" style="width: 30vw; min-width: 20px; max-width: 60px; margin-left: 10px;"></a></div>'

# Page title shown at the top of the Gradio app.
TITLE = f'<h1 align="center" id="space-title"> Open CoT Dashboard</h1> {LOGOS}'

# Markdown intro rendered below the title.
INTRODUCTION_TEXT = """
Baseline accuracies and marginal accuracy gains for specific models and CoT regimes from the [Open CoT Leaderboard](https://huggingface.co/spaces/logikon/open_cot_leaderboard).
"""
def restart_space():
    """Restart this Hugging Face Space via the Hub API (recovery hook used on data-load failure)."""
    API.restart_space(repo_id=REPO_ID, token=TOKEN)
# Load the evaluation data once at module import. On any failure (e.g. the
# dataset is temporarily unavailable), ask the Hub to restart the Space.
# NOTE(review): execution continues after restart_space(), so a failed load
# leaves df_cot_err / df_cot_regimes undefined and the UI code below will
# raise NameError — confirm this fail-and-restart behavior is intended.
try:
    df_cot_err, df_cot_regimes = load_cot_data()
except Exception:
    restart_space()
def plot_evals(model_id, plotly_mode, request: gr.Request):
    """Build the faceted scatter of marginal accuracy gains, one facet per task.

    A ``?model=<id>`` URL query parameter, when it names a known model,
    overrides the dropdown selection. Returns the plotly figure together
    with the (possibly overridden) model id.
    """
    data = df_cot_err.copy()

    # Allow deep-linking a model through the URL query string.
    if request and "model" in request.query_params:
        requested = request.query_params["model"]
        if requested in data.model.to_list():
            model_id = requested

    data["selected"] = data.model.apply(lambda m: "selected" if m == model_id else "-")
    # Sorting by ["selected", "model"] would currently have no effect with
    # px.scatter, so it is omitted.

    if plotly_mode == "dark":
        template = "plotly_dark"
    else:
        template = "plotly"

    fig = px.scatter(
        data,
        x="base accuracy",
        y="marginal acc. gain",
        color="selected",
        symbol="model",
        facet_col="task",
        facet_col_wrap=3,
        category_orders={"selected": ["selected", "-"]},
        color_discrete_sequence=["Orange", "Gray"],
        template=template,
        error_y="acc_gain-err",
        hover_data=["model", "cot accuracy"],
        width=1200,
        height=700,
    )
    fig.update_layout(title={"automargin": True})
    return fig, model_id
def get_model_table(model_id):
    """Return a styled DataFrame of per-task CoT regime results for ``model_id``.

    Rows come from the global ``df_cot_regimes``; columns cover the CoT
    configuration (chain, best_of, sampling params) plus baseline accuracy,
    CoT accuracy, and the absolute accuracy delta.
    """

    def make_pretty(styler):
        # One decimal place, hidden index, sequential colormap for the
        # accuracies and a diverging one for the gain column.
        styler.hide(axis="index")
        styler.format(precision=1)  # fix: a stray trailing comma made this a no-op-looking tuple expression
        styler.background_gradient(
            axis=None,
            subset=["acc_base", "acc_cot"],
            vmin=20, vmax=100, cmap="YlGnBu"
        )
        styler.background_gradient(
            axis=None,
            subset=["acc_gain"],
            vmin=-20, vmax=20, cmap="coolwarm"
        )
        # NOTE(review): the selected frame has no 'B' column (and no 'acc_gain'
        # column after the selection below) — the 'B' entry looks like a
        # leftover from a pandas docs example; confirm and drop.
        styler.set_table_styles({
            'task': [{'selector': '',
                      'props': [('font-weight', 'bold')]}],
            'B': [{'selector': 'td',
                   'props': 'color: blue;'}]
        }, overwrite=False)
        return styler

    df_cot_model = df_cot_regimes[df_cot_regimes.model.eq(model_id)][
        ['task', 'cot_chain', 'best_of', 'temperature', 'top_k', 'top_p',
         'acc_base', 'acc_cot', 'delta_abs']
    ]

    df_cot_model = df_cot_model \
        .rename(columns={"temperature": "temp"}) \
        .replace({'cot_chain': 'ReflectBeforeRun'}, "Reflect") \
        .sort_values(["task", "cot_chain"]) \
        .reset_index(drop=True)

    return df_cot_model.style.pipe(make_pretty)
def styled_model_table(model_id, request: gr.Request):
    """Return the styled per-task results table, honoring a ?model=<id> URL override."""
    has_override = bool(request) and "model" in request.query_params
    if has_override:
        candidate = request.query_params["model"]
        if candidate in df_cot_regimes.model.to_list():
            model_id = candidate
    return get_model_table(model_id)
|
89 |
+
|
90 |
+
# Assemble the Gradio UI: header, controls row, results table, and eval plot.
demo = gr.Blocks()

with demo:

    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)
    with gr.Row():
        model_list = gr.Dropdown(list(df_cot_err.model.unique()), value="allenai/tulu-2-70b", label="Model", scale=2)
        plotly_mode = gr.Radio(["dark","light"], value="dark", label="Plot theme", scale=1)
        submit = gr.Button("Update", scale=1)
    table = gr.DataFrame()
    plot = gr.Plot(label="evals")


    # "Update" re-renders both outputs; the same handlers also run on page
    # load so the dashboard is populated immediately (and can honor a
    # ?model=... query parameter via the gr.Request argument).
    submit.click(plot_evals, [model_list, plotly_mode], [plot, model_list])
    submit.click(styled_model_table, model_list, table)
    demo.load(plot_evals, [model_list, plotly_mode], [plot, model_list])
    demo.load(styled_model_table, model_list, table)

demo.launch()
backend/data.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
|
4 |
+
import datasets # type: ignore
|
5 |
+
from huggingface_hub import snapshot_download # type: ignore
|
6 |
+
import pandas as pd # type: ignore
|
7 |
+
|
8 |
+
from backend.envs import EVAL_DATASET, TRACES_DATASET, TOKEN, EVAL_RESULTS_PATH
|
9 |
+
|
10 |
+
|
11 |
+
SUBSETS = ["base","cot","orig"]
|
12 |
+
|
13 |
+
|
14 |
+
def load_cot_data():
    """Download and assemble the CoT evaluation data from the Hub.

    Returns a tuple ``(df_cot_err, df_cot)``:
      * ``df_cot_err`` — one row per (model, task) with mean baseline/CoT
        accuracy ("base accuracy", "cot accuracy"), mean accuracy gain
        ("marginal acc. gain"), and a symmetric error bar ("acc_gain-err",
        half the min-max spread of the gain across configs);
      * ``df_cot`` — one row per (model, task, config), joined with the CoT
        generation config (chain, sampling parameters); accuracies in percent.

    Performs network I/O: snapshots the eval-results dataset and loads the
    traces dataset.
    """

    ####
    # Load the evaluation results data
    ####

    # download raw data
    snapshot_download(
        repo_id=EVAL_DATASET,
        revision="main",
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN
    )

    # get all models for which results are stored (org/model directory pairs)
    models = []
    for path in glob.glob(f"{EVAL_RESULTS_PATH}/data/*/*", recursive=False):
        models.append(path.replace(f"{EVAL_RESULTS_PATH}/data/",""))

    # load the evaluation results and create a dataframe
    results = []
    for model in models:
        for subset in SUBSETS:
            result_files = glob.glob(f"{EVAL_RESULTS_PATH}/data/{model}/{subset}/**/*.json", recursive=True)
            for json_filepath in result_files:
                with open(json_filepath) as fp:
                    data = json.load(fp)
                if "results" in data.keys():
                    for k,v in data["results"].items():
                        record = v.copy()
                        record["model"] = model
                        record["subset"] = subset
                        results.append(record)

    df_results = pd.DataFrame(results)
    del results  # free the raw records before further processing

    # postprocess task/config data
    def split_alias(alias: str) -> pd.Series:
        """Split an eval alias like "<config>_<task>_cot" into task and config."""
        if alias[-5:]=="_base":
            alias = alias[:-5]
        elif alias[-4:]=="_cot":
            alias = alias[:-4]

        if "_" not in alias:
            task = alias
            config = ""
        else:
            # NOTE(review): assumes exactly one "_" remains; an alias with
            # more underscores raises ValueError here — confirm naming scheme.
            config, task = alias.split("_")

        return pd.Series({"task": task, "config": config})

    df_results = pd.concat([df_results, df_results.alias.apply(split_alias)], axis=1)

    # baseline accuracies in separate df, averaged per (model, task)
    df_baseline = df_results[df_results.subset.eq("base")].groupby(["model","task"])[["acc,none"]].mean()

    # build cot eval df with baseline accuracies in separate column
    df_tmp1 = df_results[df_results.subset.eq("cot")].sort_values(by=["model","task","config"])
    df_tmp1.reset_index(inplace=True, drop=True)

    df_cot = df_tmp1[["model","task","config"]].copy()
    df_cot["acc_cot"] = df_tmp1["acc,none"]
    df_cot["acc_base"] = df_cot.apply(lambda row: df_baseline.loc[(row.model, row.task)]["acc,none"], axis=1)

    # absolute and relative accuracy gains from chain-of-thought
    df_cot["acc_gain"] = df_cot.acc_cot - df_cot.acc_base
    df_cot["delta_rel"] = (df_cot.acc_cot - df_cot.acc_base)/df_cot.acc_base

    # average eval results for all tasks in extra df
    df_cot_avg = df_cot.groupby(["model","config"]).mean(numeric_only=True).reset_index()
    df_cot_avg["task"] = "all"

    # add average results to cot df
    df_cot = pd.concat([df_cot_avg, df_cot], ignore_index=True)


    ####
    # Load the traces data
    ####

    # load traces data and extract the generation configs
    dataset = datasets.load_dataset(TRACES_DATASET, split="test", token=TOKEN)
    dataset = dataset.select_columns(["config_data"])
    df_cottraces = pd.DataFrame({"config_data": dataset["config_data"]})
    del dataset
    config_data = []
    for data in df_cottraces.config_data.to_list():
        config_data.append(dict(data))
    del df_cottraces
    df_cotconfigs = pd.DataFrame(config_data)
    df_cotconfigs.drop_duplicates(inplace=True, ignore_index=True)
    # (removed a leftover bare `df_cotconfigs` expression statement — it had no effect)

    # add cot configs data to df_cot
    def select_config_data(row):
        """Look up the generation config row for a (model, config-name) pair."""
        df_selected = df_cotconfigs[df_cotconfigs.name.eq(row.config) & df_cotconfigs.model.eq(row.model)]
        if len(df_selected) == 0:
            print(f"Config {row.config} not found for model {row.model}")
            return None
        return df_selected.drop(columns=["name", "model", "task"]).iloc[0]

    df_cot = pd.concat(
        [
            df_cot,
            df_cot.apply(select_config_data, axis=1)
        ],
        axis=1
    )

    # accuracy values in percent
    for col in ['acc_base', 'acc_cot', 'acc_gain']:
        df_cot[col] = 100 * df_cot[col]

    ####
    # Create error dataframe
    ####

    # per (model, task): mean/min/max gain and mean accuracies; the error bar
    # is half the min-max spread of the gain across configs
    df_cot_err = df_cot.groupby(["model","task"]).agg({'acc_gain': ['mean', 'min', 'max'], "acc_base": "mean", "acc_cot": "mean"})
    df_cot_err.columns = ['-'.join(col).strip() for col in df_cot_err.columns.values]
    df_cot_err["acc_gain-err"] = 0.5 * (df_cot_err["acc_gain-max"] - df_cot_err["acc_gain-min"])
    df_cot_err.reset_index(inplace=True)
    df_cot_err.rename(columns={"acc_base-mean": "base accuracy", "acc_cot-mean": "cot accuracy", "acc_gain-mean": "marginal acc. gain"}, inplace=True)

    return df_cot_err, df_cot
backend/envs.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from huggingface_hub import HfApi  # type: ignore


# Hub access token (needed for private datasets and to restart the Space).
TOKEN = os.environ.get("TOKEN", None)

# Hub organization owning the dashboard and its datasets.
OWNER = "cot-leaderboard"
REPO_ID = f"{OWNER}/open_cot_dashboard"          # this Space's repo id
EVAL_DATASET = f"{OWNER}/cot-eval-results"       # raw eval result JSONs
TRACES_DATASET = f"{OWNER}/cot-eval-traces-2.0"  # reasoning traces + configs

# Root directory for local caches (defaults to the working directory).
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "cot-eval-results")

# Shared Hub API client, authenticated with TOKEN when present.
API = HfApi(token=TOKEN)