File size: 4,395 Bytes
f90ad24
 
 
 
 
b2c063a
f90ad24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2c063a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f90ad24
 
59c748f
f90ad24
 
fcb01e3
f90ad24
 
 
b2c063a
 
 
 
 
 
f90ad24
fcb01e3
59c748f
fcb01e3
b2c063a
fcb01e3
59c748f
b2c063a
f90ad24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcb01e3
f90ad24
fcb01e3
 
f90ad24
 
fcb01e3
 
 
f90ad24
 
 
 
 
59c748f
fcb01e3
f90ad24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import datetime
import glob
import json
import os
import shutil
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import Repository, HfApi
from transformers import AutoConfig, AutoModel
# Token and repo id for the lmeh eval data (clone / pull).
H4_TOKEN = os.environ.get("H4_TOKEN")
LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"

# METRICS and BENCHMARKS are paired by position: METRICS[i] is the metric
# read from the result files of BENCHMARKS[i].
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]

# Benchmark name -> column header used in the result rows.
BENCH_TO_NAME = {
    "arc_challenge": "ARC (25-shot) ⬆️",
    "hellaswag": "HellaSwag (10-shot) ⬆️",
    "hendrycks": "MMLU (5-shot) ⬆️",
    "truthfulqa_mc": "TruthQA (0-shot) ⬆️",
}
def make_clickable_model(model_name):
    """Return an HTML anchor linking ``model_name`` to its Hugging Face page.

    The link opens in a new tab and shows the full model name as its text.
    """
    href = "https://huggingface.co/" + model_name
    css = "color: blue; text-decoration: underline;text-decoration-style: dotted;"
    return '<a target="_blank" href="{}" style="{}">{}</a>'.format(href, css, model_name)

def get_n_params(base_model):
    """Return the parameter count of ``base_model`` as a string.

    The lookup is currently disabled, so every model reports ``"unknown"``.
    The earlier implementation built the model from its config
    (``AutoModel.from_config(config).num_parameters()``), which was flagged
    as having high memory usage; that code sat unreachable behind an
    unconditional return and has been removed.

    Args:
        base_model: Model identifier such as ``"org/model"``. Currently unused.

    Returns:
        The string ``"unknown"``.
    """
    return "unknown"

@dataclass
class EvalResult:
    """Benchmark scores gathered for one evaluated model configuration."""

    eval_name: str  # key identifying this org/model/revision/precision combination
    org: str  # may be None for models that have no organisation prefix
    model: str
    revision: str
    is_8bit: bool
    results: dict  # benchmark name -> score

    def to_dict(self):
        """Return this result as a flat dict row.

        The row holds the identifying fields, a clickable model link, the sum
        of all available scores and one column per entry of ``BENCH_TO_NAME``.
        Benchmarks without a score get ``None`` in their column.

        ``self.results`` is not modified, so the method can be called more
        than once. Previously the missing benchmarks were written back into
        ``self.results`` as ``None``, which made a second call fail when
        summing the scores.
        """
        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"

        # Only real scores contribute to the total; missing ones are skipped.
        available_scores = [score for score in self.results.values() if score is not None]

        data_dict = {
            "eval_name": self.eval_name,
            "8bit": self.is_8bit,
            "base_model": make_clickable_model(base_model),
            "revision": self.revision,
            "total ⬆️": round(sum(available_scores), 3),
            "# params": get_n_params(base_model),
        }

        for benchmark, column_name in BENCH_TO_NAME.items():
            data_dict[column_name] = self.results.get(benchmark)

        return data_dict
        
        
   
   
def parse_eval_result(json_filepath: str) -> Tuple[str, Optional["EvalResult"]]:
    """Parse one evaluation JSON file into a result key and an ``EvalResult``.

    Model, revision and the 8-bit flag are read from the directory names of
    the "/"-separated path, counted from the end:
    ``.../[<org>/]<model>/<revision>/<precision>/<file>.json``. A path with
    exactly 6 components is treated as having no org.

    Args:
        json_filepath: Path of the result file. Its ``"results"`` entry must
            map to dicts that contain the metric paired with the benchmark.

    Returns:
        ``(result_key, eval_result)``. ``eval_result`` holds the mean metric,
        rounded to 3 decimals, for the benchmark whose name occurs in the
        path. It is ``None`` when no name from ``BENCHMARKS`` occurs in the
        path.
    """
    with open(json_filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    path_split = json_filepath.split("/")
    org = None
    model = path_split[-4]
    is_8bit = path_split[-2] == "8bit"
    revision = path_split[-3]
    if len(path_split) == 6:
        # handles gpt2 type models that don't have an org
        result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
    else:
        result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
        org = path_split[-5]

    eval_result = None
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        # NOTE(review): this is a substring match on the whole path, so an org
        # or model name containing a benchmark name would also match; if
        # several match, the last one in BENCHMARKS wins.
        if benchmark not in json_filepath:
            continue
        accs = np.array([task_scores[metric] for task_scores in data["results"].values()])
        mean_acc = round(np.mean(accs), 3)
        eval_result = EvalResult(result_key, org, model, revision, is_8bit, {benchmark: mean_acc})

    return result_key, eval_result
        
    
    
   
def get_eval_results() -> List[EvalResult]:
    """Collect all results under ``evals/eval_results`` and merge them per key.

    Every ``*.json`` file below that directory is parsed with
    ``parse_eval_result``. Files that share a result key are merged into one
    ``EvalResult`` by combining their ``results`` dicts.

    Returns:
        One ``EvalResult`` per distinct result key, in order of first
        appearance among the sorted file paths.
    """
    # glob gives no ordering guarantee; sorting keeps the row order and the
    # merge precedence (later files overwrite earlier ones) reproducible.
    json_filepaths = sorted(glob.glob("evals/eval_results/**/*.json", recursive=True))
    merged_results = {}

    for json_filepath in json_filepaths:
        result_key, eval_result = parse_eval_result(json_filepath)
        if eval_result is None:
            # The path named no known benchmark, so there is nothing to merge.
            # Storing None would fail later when the result is used.
            continue
        if result_key in merged_results:
            merged_results[result_key].results.update(eval_result.results)
        else:
            merged_results[result_key] = eval_result

    return list(merged_results.values())
    
def get_eval_results_dicts() -> List[Dict]:
    """Return every collected evaluation result converted to a dict row."""
    rows = []
    for result in get_eval_results():
        rows.append(result.to_dict())
    return rows

# Runs at import time: load all evaluation results and print them to stdout.
# Despite its name, eval_results_dict is a list with one dict per result.
eval_results_dict = get_eval_results_dicts()
print(eval_results_dict)