|
from dataclasses import dataclass |
|
from enum import Enum |
|
|
|
|
|
@dataclass
class Task:
    """Record describing a single evaluation task shown on the leaderboard.

    The first four fields are required and positional; the two boolean
    flags are optional and both default to ``True``.
    """

    # Benchmark identifier string (e.g. "arc_challenge_ita").
    benchmark: str
    # Metric key string (e.g. "f1,none" or "acc_norm,none").
    metric: str
    # Human-readable label for the task (presumably the leaderboard column
    # header -- confirm against the code that renders the table).
    col_name: str
    # Short group tag for the task (this file uses "NLU", "CFK" and "BFS").
    category: str
    # NOTE(review): presumably marks whether larger scores are better for
    # this task -- confirm against the consumer of this flag.
    higher_is_better: bool = True
    # NOTE(review): presumably controls whether the raw score is multiplied
    # by 100 before display -- confirm against the consumer of this flag.
    scale_by_100: bool = True
|
|
|
|
|
|
|
|
|
class Tasks(Enum):
    """Registry of the tasks that make up the suite.

    Each member's value is a ``Task`` record. Members are iterated in
    definition order, so the order below must be preserved; the numeric
    suffix of the member names does not follow that order.
    """

    task1 = Task(
        benchmark="ami_2020_aggressiveness",
        metric="f1,none",
        col_name="AMI 2020 Agg",
        category="NLU",
    )
    task2 = Task(
        benchmark="ami_2020_misogyny",
        metric="f1,none",
        col_name="AMI 2020 Miso",
        category="NLU",
    )
    task0 = Task(
        benchmark="arc_challenge_ita",
        metric="acc_norm,none",
        col_name="ARC-C",
        category="CFK",
    )
    task4 = Task(
        benchmark="belebele_ita",
        metric="acc_norm,none",
        col_name="Belebele",
        category="NLU",
    )
    task3 = Task(
        benchmark="gente_rephrasing",
        metric="acc,none",
        col_name="GeNTE Neutralizing",
        category="BFS",
    )
    task12 = Task(
        benchmark="haspeede2_hs",
        metric="f1,none",
        col_name="HaSpeeDe2 HS",
        category="BFS",
    )
    task13 = Task(
        benchmark="haspeede2_stereo",
        metric="f1,none",
        col_name="HaSpeeDe2 Stereo",
        category="BFS",
    )
    task5 = Task(
        benchmark="hatecheck_ita",
        metric="f1,none",
        col_name="HateCheck",
        category="BFS",
    )
    # The only task that overrides higher_is_better.
    task6 = Task(
        benchmark="honest_ita",
        metric="acc,none",
        col_name="HONEST",
        category="BFS",
        higher_is_better=False,
    )
    task14 = Task(
        benchmark="ironita_irony",
        metric="f1,none",
        col_name="IronITA Irony",
        category="NLU",
    )
    task15 = Task(
        benchmark="ironita_sarcasm",
        metric="f1,none",
        col_name="IronITA Sarcasm",
        category="NLU",
    )
    # The only task that overrides scale_by_100.
    task7 = Task(
        benchmark="itacola",
        metric="mcc,none",
        col_name="ItaCoLA",
        category="NLU",
        scale_by_100=False,
    )
    task8 = Task(
        benchmark="news_sum",
        metric="bertscore,none",
        col_name="News Sum",
        category="NLU",
    )
    task16 = Task(
        benchmark="sentipolc",
        metric="f1,none",
        col_name="SENTIPOLC",
        category="NLU",
    )
    task9 = Task(
        benchmark="squad_it",
        metric="squad_f1,get-answer",
        col_name="SQuAD it",
        category="CFK",
    )
    task10 = Task(
        benchmark="truthfulqa_mc2_ita",
        metric="acc,none",
        col_name="TruthfulQA",
        category="CFK",
    )
    task11 = Task(
        benchmark="xcopa_it",
        metric="acc,none",
        col_name="XCOPA",
        category="CFK",
    )
    task17 = Task(
        benchmark="hellaswag_ita",
        metric="acc_norm,none",
        col_name="Hellaswag-it",
        category="CFK",
    )
|
|
|
|
|
# Few-shot setting, fixed at zero.
# NOTE(review): presumably the number of in-context examples used when the
# tasks are run -- confirm against the code that reads this constant.
NUM_FEWSHOT: int = 0
|
|
|
|
|
|
|
|
|
# HTML heading for the page title (centered <h1> with id "space-title").
TITLE = '<h1 align="center" id="space-title">ItaEval leaderboard</h1>'
|
|
|
|
|
# Introductory Markdown/HTML blurb: what the leaderboard evaluates, the
# submission form link, and two informational bullet points.
# Written as adjacent literals, one per line of text.
INTRODUCTION_TEXT = (
    "\n"
    "This leaderboard evaluates language models on <b>ItaEval</b>, a new unified benchmark for Italian.\n"
    "\n"
    "Submit your model: [Google Form](https://forms.gle/xpGH66DpVRcCmdcJ6)\n"
    "\n"
    "Some information:\n"
    "- Unlike other leaderboards you may find online, we do not support automatic evaluation for new model submissions. Currently, we are taking care of running models on the suite. Please fill out the form above to have your model evaluated and included here.\n"
    "- You can find some more details on the suite in our [technical report](https://bit.ly/itaeval_tweetyita_v1)\n"
)
|
|
|
# URL of the repository with the instructions to reproduce the results.
ITA_EVAL_REPO = "https://github.com/g8a9/ita-eval"

# Markdown text with the reproducibility instructions.
# The bash continuation backslashes are written as "\\": in a non-raw
# (f-)string a single backslash before a newline is a line continuation
# that Python removes, which would collapse the displayed command into one
# line with no backslashes.
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
Head to {ITA_EVAL_REPO} to reproduce our results for all the instructions.

If all the setup goes smoothly, you can run 'MODEL' on ItaEval with:
```bash
MODEL="your-model-id-on-the-huggingface-hub"
lm_eval --model hf \\
    --model_args pretrained=$MODEL,dtype=bfloat16 \\
    --tasks ita_eval \\
    --batch_size 1 \\
    --log_samples \\
    --output_path "."
```
"""
|
|
|
# Markdown notice about submissions: autonomous submissions are not accepted
# yet, and the linked form is how a model gets evaluated.
EVALUATION_QUEUE_TEXT = (
    "\n"
    "We do not plan to accept autonomous submissions, yet. "
    "Fill [this form](https://forms.gle/xpGH66DpVRcCmdcJ6) to have your model evaluated."
    "\n"
)
|
|
|
# Label string and placeholder body for the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# The body has no backslashes, so a plain literal with explicit "\n"
# escapes is equivalent to a raw triple-quoted one.
CITATION_BUTTON_TEXT = "\nWe are working on it! :)\n"
|
|