kunato commited on
Commit
baa7db6
0 Parent(s):

init commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Both targets are commands, not files, so declare exactly the targets defined below.
.PHONY: style quality
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.26.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Start the configuration
14
+
15
+ Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
16
+
17
+ Results files should have the following format and be stored as json files:
18
+ ```json
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub",
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score,
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score,
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter a problem on the Space, don't hesitate to restart it; this removes the eval-queue, eval-queue-bk, eval-results and eval-results-bk folders it created.
39
+
40
+ # Code logic for more complex edits
41
+
42
+ You'll find
43
+ - the main table's column names and properties in `src/display/utils.py`
44
+ - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
45
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import gradio as gr
3
+ from apscheduler.schedulers.background import BackgroundScheduler
4
+ from huggingface_hub import snapshot_download
5
+
6
+ from src.pages.about import show_about_page
7
+ from src.pages.submit import show_submit_page
8
+ from src.pages.result_table import show_result_page
9
+ from src.about import (
10
+ CITATION_BUTTON_LABEL,
11
+ CITATION_BUTTON_TEXT,
12
+ INTRODUCTION_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
17
+
18
+
19
+
20
def restart_space():
    """Ask the Hub API client to restart the Space identified by ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)
22
+
23
# Download the request-queue dataset and then the results dataset into their local
# folders. Each download is attempted independently; any exception raised while
# printing the target folder or downloading triggers restart_space().
for _repo_id, _local_dir in (
    (QUEUE_REPO, EVAL_REQUESTS_PATH),
    (RESULTS_REPO, EVAL_RESULTS_PATH),
):
    try:
        print(_local_dir)
        snapshot_download(
            repo_id=_repo_id,
            local_dir=_local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()
37
+
38
+
39
# Build the page: title, introduction, one tab per result family, then the about and
# submit tabs, and finally a collapsible citation box.
# NOTE(review): block nesting was reconstructed from a listing without indentation —
# confirm it against the deployed layout.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # (results sub-folder, tab label); the position in this tuple is the tab id.
        result_tabs = (
            ("MC", "📝 Exam"),
            ("LLM", "🤖 LLM as Judge"),
            ("NLU", "🕵️ NLU"),
            ("NLG", "🖊️ NLG"),
        )
        for tab_index, (tab_root, tab_title) in enumerate(result_tabs):
            show_result_page(root_path=tab_root, title=tab_title, index=tab_index)
        show_about_page(index=4)
        show_submit_page(index=5)

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=8,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Schedule restart_space every 1800 seconds on a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ gradio
5
+ gradio_client
6
+ huggingface-hub>=0.18.0
7
+ numpy==1.24.2
8
+ pandas==2.0.0
9
+ requests==2.28.2
10
+ tqdm==4.65.0
11
+ transformers==4.35.2
12
+ python-dotenv
src/about.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+
5
+
6
+ NUM_FEWSHOT = 0 # Change with your few shot
7
+ # ---------------------------------------------------
8
+
9
+ TITLE = """<h1>🇹🇭 Thai LLM Leaderboard</h1>"""
10
+
11
+
12
+ # <a href="url"></a>
13
+
14
+ INTRODUCTION_TEXT = """
15
+ The Thai-LLM Leaderboard 🇹🇭 focuses on standardizing evaluation methods for large language models (LLMs) in the Thai language, based on <a href="https://github.com/SEACrowd">Seacrowd</a>.
16
+ As part of an open community project, we welcome you to submit new evaluation tasks or models.
17
+ This leaderboard is developed in collaboration with <a href="https://www.scb10x.com">SCB 10X</a>, <a href="https://www.vistec.ac.th/">Vistec</a>, and <a href="https://aisingapore.org/">AI-Singapore</a>.
18
+ """
19
+
20
+ LLM_BENCHMARKS_TEXT = f"""
21
+ Evaluations
22
+ The leaderboard currently consists of the following benchmarks:
23
+ - Exam
24
+ - <a href="https://huggingface.co/datasets/scb10x/thai_exam">ThaiExam</a>: ThaiExam is a Thai language benchmark based on examinations for high-school students and investment professionals in Thailand.
25
+ - <a href="https://arxiv.org/abs/2306.05179">M3Exam</a>: M3Exam is a novel benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. Here is the Thai subset of M3Exam.
26
+ - LLM as Judge
27
+ - Thai MT-Bench: An <a href="https://arxiv.org/abs/2306.05685">MT-Bench</a>-inspired variant of LLM-as-a-judge, developed by Vistec specifically for Thai language and culture.
28
+ - NLU
29
+ - <a href="https://huggingface.co/datasets/facebook/belebele">Belebele</a>: Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. Here is Thai subset of Belebele.
30
+ - <a href="https://huggingface.co/datasets/facebook/xnli">XNLI</a>: XNLI is an evaluation corpus for language transfer and cross-lingual sentence classification in 15 languages. Here is Thai subset of XNLI.
31
+ - <a href="https://huggingface.co/datasets/cambridgeltl/xcopa">XCOPA</a>: XCOPA is a translation and reannotation of the English COPA to measuring commonsense across languages. Here is Thai subset of XCOPA.
32
+ - <a href="https://huggingface.co/datasets/pythainlp/wisesight_sentiment">Wisesight</a>: Wisesight sentiment analysis corpus is a social media messages in Thai language with sentiment label.
33
+ - NLG
34
+ - <a href="https://huggingface.co/datasets/csebuetnlp/xlsum">XLSum</a>: XLSum is a comprehensive and diverse dataset comprising 1.35 million professionally annotated article-summary pairs from BBC. Here is Thai subset of XLSum.
35
+ - <a href="https://huggingface.co/datasets/SEACrowd/flores200">Flores200</a>: FLORES is a benchmark dataset for machine translation between English and low-resource languages. Here is Thai subset of Flores200.
36
+ - <a href="https://huggingface.co/datasets/iapp/iapp_wiki_qa_squad">iapp Wiki QA Squad</a>: iapp Wiki QA Squad is an extractive question answering dataset from Thai Wikipedia articles.
37
+
38
+
39
+ Metrics Implementations
40
+ - BLEU is calculated using flores200 tokenizer using huggingface evaluate <a href="https://huggingface.co/spaces/evaluate-metric/sacrebleu">implementation</a>.
41
+ - ROUGEL is calculated using pythainlp newmm tokenizer using huggingface evaluate <a href="https://huggingface.co/spaces/evaluate-metric/rouge">implementation</a>.
42
+ - LLM as judge rating is judged by OpenAI gpt-4o-2024-05-13 using prompt specific by <a href="https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/judge_prompts.jsonl">lmsys MT-Bench</a>.
43
+
44
+ Reproducibility
45
+
46
+ To learn more about the evaluation pipeline and reproduce our results, check out the repository <a href="https://github.com/SEACrowd/seacrowd-experiments">seacrowd-experiments</a>.
47
+
48
+ Acknowledgements
49
+
50
+ We're grateful to community members for submitting tasks and models. To contribute, see the Submit tab.
51
+ """
52
+
53
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
54
+ CITATION_BUTTON_TEXT = r"""@misc{lovenia2024seacrowdmultilingualmultimodaldata,
55
+ title={SEACrowd: A Multilingual Multimodal Data Hub and Benchmark Suite for Southeast Asian Languages},
56
+ author={Holy Lovenia and Rahmad Mahendra and Salsabil Maulana Akbar and Lester James V. Miranda and Jennifer Santoso and Elyanah Aco and Akhdan Fadhilah and Jonibek Mansurov and Joseph Marvin Imperial and Onno P. Kampman and Joel Ruben Antony Moniz and Muhammad Ravi Shulthan Habibi and Frederikus Hudi and Railey Montalan and Ryan Ignatius and Joanito Agili Lopo and William Nixon and Börje F. Karlsson and James Jaya and Ryandito Diandaru and Yuze Gao and Patrick Amadeus and Bin Wang and Jan Christian Blaise Cruz and Chenxi Whitehouse and Ivan Halim Parmonangan and Maria Khelli and Wenyu Zhang and Lucky Susanto and Reynard Adha Ryanda and Sonny Lazuardi Hermawan and Dan John Velasco and Muhammad Dehan Al Kautsar and Willy Fitra Hendria and Yasmin Moslem and Noah Flynn and Muhammad Farid Adilazuarda and Haochen Li and Johanes Lee and R. Damanhuri and Shuo Sun and Muhammad Reza Qorib and Amirbek Djanibekov and Wei Qi Leong and Quyet V. Do and Niklas Muennighoff and Tanrada Pansuwan and Ilham Firdausi Putra and Yan Xu and Ngee Chia Tai and Ayu Purwarianti and Sebastian Ruder and William Tjhi and Peerat Limkonchotiwat and Alham Fikri Aji and Sedrick Keh and Genta Indra Winata and Ruochen Zhang and Fajri Koto and Zheng-Xin Yong and Samuel Cahyawijaya},
57
+ year={2024},
58
+ eprint={2406.10118},
59
+ archivePrefix={arXiv},
60
+ primaryClass={cs.CL},
61
+ url={https://arxiv.org/abs/2406.10118},
62
+ }"""
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+
3
+ .markdown-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ #citation-button span {
12
+ font-size: 16px !important;
13
+ }
14
+
15
+ #citation-button textarea {
16
+ font-size: 16px !important;
17
+ }
18
+
19
+ #citation-button > label > button {
20
+ margin: 6px;
21
+ transform: scale(1.3);
22
+ }
23
+
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
+ }
31
+
32
+ #search-bar-table-box > div:first-child {
33
+ background: none;
34
+ border: none;
35
+ }
36
+
37
+ #search-bar {
38
+ padding: 0px;
39
+ }
40
+
41
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ table td:first-child,
43
+ table th:first-child {
44
+ max-width: 400px;
45
+ overflow: auto;
46
+ white-space: nowrap;
47
+ }
48
+
49
+ .tab-buttons button {
50
+ font-size: 20px;
51
+ }
52
+
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
+ }
61
+
62
+ #scale-logo .download {
63
+ display: none;
64
+ }
65
+ #filter_type{
66
+ border: 0;
67
+ padding-left: 0;
68
+ padding-top: 0;
69
+ }
70
+ #filter_type label {
71
+ display: flex;
72
+ }
73
+ #filter_type label > span{
74
+ margin-top: var(--spacing-lg);
75
+ margin-right: 0.5em;
76
+ }
77
+ #filter_type label > .wrap{
78
+ width: 103px;
79
+ }
80
+ #filter_type label > .wrap .wrap-inner{
81
+ padding: 2px;
82
+ }
83
+ #filter_type label > .wrap .wrap-inner input{
84
+ width: 1px
85
+ }
86
+ #filter-columns-type{
87
+ border:0;
88
+ padding:0.5;
89
+ }
90
+ #filter-columns-size{
91
+ border:0;
92
+ padding:0.5;
93
+ }
94
+ #box-filter > .form{
95
+ border: 0
96
+ }
97
+ """
98
+
99
+ get_window_url_params = """
100
+ function(url_params) {
101
+ const params = new URLSearchParams(window.location.search);
102
+ url_params = Object.fromEntries(params);
103
+ return url_params;
104
+ }
105
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens ``link`` in a new tab and shows ``model_name`` as its text."""
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return '<a target="_blank" href="{}" style="{}">{}</a>'.format(link, anchor_style, model_name)
3
+
4
+
5
def make_clickable_model(model_name):
    """Return the HTML anchor for ``model_name``, pointing at ``https://huggingface.co/<model_name>``."""
    return model_hyperlink("https://huggingface.co/{}".format(model_name), model_name)
8
+
9
+
10
def styled_error(error):
    """Wrap ``error`` in a centered, red, 20px HTML paragraph."""
    opening_tag = "<p style='color: red; font-size: 20px; text-align: center;'>"
    return "{}{}</p>".format(opening_tag, error)
12
+
13
+
14
def styled_warning(warn):
    """Wrap ``warn`` in a centered, orange, 20px HTML paragraph."""
    opening_tag = "<p style='color: orange; font-size: 20px; text-align: center;'>"
    return "{}{}</p>".format(opening_tag, warn)
16
+
17
+
18
def styled_message(message):
    """Wrap ``message`` in a centered, green, 20px HTML paragraph."""
    opening_tag = "<p style='color: green; font-size: 20px; text-align: center;'>"
    return "{}{}</p>".format(opening_tag, message)
20
+
21
+
22
def has_no_nan_values(df, columns):
    """Return a per-row boolean result that is True where none of ``columns`` is missing in ``df``."""
    any_missing = df[columns].isna().any(axis=1)
    return ~any_missing
24
+
25
+
26
def has_nan_values(df, columns):
    """Return a per-row boolean result that is True where at least one of ``columns`` is missing in ``df``."""
    all_present = df[columns].notna().all(axis=1)
    return ~all_present
src/display/utils.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ import pandas as pd
3
+
4
+
5
def fields(raw_class):
    """Return the values stored in ``raw_class.__dict__``, in definition order.

    Entries whose name starts or ends with a double underscore are skipped.
    """
    collected = []
    for attr_name, attr_value in vars(raw_class).items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
7
+
8
+
9
+ # These classes are for user facing column names,
10
+ # to avoid having to change them all around the code
11
+ # when a modif is needed
12
@dataclass(frozen=True)
class ColumnContent:
    """Display metadata for one table column.

    The class is frozen so that instances are hashable. Instances are used as field
    defaults when ``AutoEvalColumn`` is generated with ``make_dataclass``, and
    Python 3.11+ rejects unhashable dataclass defaults as "mutable default" with a
    ``ValueError``.
    """

    name: str  # header text of the column
    type: str  # datatype string of the column (e.g. "markdown", "str", "bool")
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
19
+
20
## Leaderboard columns
# Each entry is a [field name, field type, default value] triple, the form make_dataclass accepts.
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
]

# The column holder is generated with make_dataclass from the list above
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
28
+
29
+ ## For the queue columns in the submission tab
30
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    # Columns of the evaluation-queue tables. These are plain, un-annotated class
    # attributes, so they are not dataclass fields; EVAL_COLS / EVAL_TYPES collect
    # them through fields(EvalQueueColumn).
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    status = ColumnContent("status", "str", True)
36
+
37
+ ## All the model information that we might need
38
@dataclass
class ModelDetails:
    # Record describing a model; only ``name`` is required.
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji
43
+
44
+
45
+ # Column selection
46
# Names and types of the leaderboard columns that are displayed by default and not hidden
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

# Names and types of every evaluation-queue column
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# Right-closed numeric intervals keyed by a display label.
# NOTE(review): the labels look like model-size buckets (presumably billions of
# parameters) — confirm against the code that consumes this mapping.
NUMERIC_INTERVALS = {
    "?": pd.Interval(-1, 0, closed="right"),
    "~1.5": pd.Interval(0, 2, closed="right"),
    "~3": pd.Interval(2, 4, closed="right"),
    "~7": pd.Interval(4, 9, closed="right"),
    "~13": pd.Interval(9, 20, closed="right"),
    "~35": pd.Interval(20, 45, closed="right"),
    "~60": pd.Interval(45, 70, closed="right"),
    "70+": pd.Interval(70, 10000, closed="right"),
}
src/envs.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ load_dotenv()
4
+ from huggingface_hub import HfApi
5
+
6
# Info to change for your repository
# ----------------------------------
# Read from the TOKEN environment variable; None when the variable is not set.
TOKEN = os.environ.get("TOKEN") # A read/write token for your org

OWNER = "kunato-lab" # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

# Hub repository ids, all derived from OWNER
REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# If you setup a cache later, just change HF_HOME
# Falls back to the current working directory when HF_HOME is not set.
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Shared Hub API client, authenticated with TOKEN
API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ import glob
3
+ import json
4
+ import os
5
+ from dataclasses import dataclass
6
+ import dateutil
7
+
8
+ from src.submission.check_validity import get_model_size
9
+ from src.envs import API
10
+ from src.display.formatting import model_hyperlink
11
+ from src.display.utils import AutoEvalColumn
12
+
13
+
14
# Maps a dataset/task key found under "results" in a result file to the label used in
# the leaderboard column name. An empty label means the dataset is left out when
# EvalResult.to_dict builds a row; a key missing from this mapping raises KeyError there.
DATASET_TO_NAME_MAPPING = {
    'wisesight_thai_sentiment_seacrowd_text': "Wisesight - SA",
    'xnli.tha_seacrowd_pairs': "XNLI Thai - NLI",
    "xcopa_tha_seacrowd_qa": "XCOPA Thai - Reasoning",
    'belebele_tha_thai_seacrowd_qa': "Belebele - RC",
    'xl_sum_tha_seacrowd_t2t': "XLSum Summarization",
    'm3exam_tha_seacrowd_qa': 'M3Exam',
    "flores200_eng_Latn_tha_Thai_seacrowd_t2t": 'Flores200 ENG->TH',
    "flores200_tha_Thai_eng_Latn_seacrowd_t2t": 'Flores200 TH->ENG',
    "iapp_squad_seacrowd_qa": "iapp Squad - QA",
    "Writing": "MT-Bench: Writing",
    'Math': "MT-Bench: Math",
    'Coding': 'MT-Bench: Coding',
    'Extraction': 'MT-Bench: Extraction',
    'Reasoning': 'MT-Bench: Reasoning',
    'Roleplay': 'MT-Bench: Roleplay',
    'STEM': 'MT-Bench: STEM',
    'Social Science': 'MT-Bench: Social Science',
    'thaiexam_qa': "Thai Exam",

    # Known dataset keys that are not displayed (empty label)
    'lr_sum_tha_seacrowd_t2t': '',
    'ntrex_128_eng-US_tha_seacrowd_t2t': '',
    'ntrex_128_tha_eng-US_seacrowd_t2t': '',
}
38
+
39
# Maps a metric key of a result file to the label appended, in parentheses, to the
# dataset label in a leaderboard column name. An empty label means the metric is left
# out when EvalResult.to_dict builds a row; a missing key raises KeyError there.
METRICS_TO_NAME_MAPPING = {
    'accuracy': "Acc",
    'ROUGE1': '',
    'ROUGE2': '',
    'ROUGEL': 'ROUGEL',
    'BLEU': '',
    'SacreBLEU': 'BLEU',
    'chrF++': '',
    'avg_rating': "Rating"
}
49
+
50
+ def _parse_value_for_metric(value, metric):
51
+ if metric.lower() == 'accuracy':
52
+ return float("{:.4f}".format(value)) * 100
53
+ else:
54
+ return float("{:.2f}".format(value))
55
+
56
def _get_model_size(model: str):
    """Return the size computed by ``get_model_size`` for ``model``, or 0 on any failure.

    The model info is requested from the Hub API at revision "main"; every exception
    raised by the lookup or by the size computation results in 0.
    """
    try:
        info = API.model_info(repo_id=model, revision="main")
        return get_model_size(model_info=info)
    except Exception:
        return 0
64
+
65
+ FILTERED_ONLY_FIELD = ['params']
66
+
67
+
68
@dataclass
class EvalResult:
    """One model's evaluation results, built from a result file.

    Results of the same model may be spread over several files. The submission date
    can be filled in afterwards from the matching request file.
    """

    eval_name: str  # org_model (uid)
    full_model: str  # org/model (path on hub)
    org: str  # None when the model name has no "org/" prefix
    model: str
    results: dict  # {dataset key: {metric key: raw value}}
    params: str  # copied from config["params"]; "" when absent
    model_link: str = ""
    date: str = ""  # submission date of request file

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an instance from one model result file.

        Args:
            json_filepath: Path of a JSON file with a "config" and a "results" object.

        Returns:
            A new ``EvalResult`` (``date`` is left empty).

        Raises:
            ValueError: If the config names no model ("model_name" / "model_args").
            KeyError: If the file has no "results" entry.
        """
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        config = data.get("config") or {}

        # Get model and org
        raw_name = config.get("model_name", config.get("model_args", None))
        if not isinstance(raw_name, str):
            raise ValueError(f"No model_name/model_args found in the config of {json_filepath}")
        org_and_model = raw_name.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}"
        else:
            org, model = org_and_model
            result_key = f"{org}_{model}"

        return cls(
            eval_name=result_key,
            full_model="/".join(org_and_model),
            model_link=config.get('model_link', ''),
            org=org,
            model=model,
            # Shallow copy of the results available in this file
            results=dict(data['results']),
            params=config.get('params', ''),
        )

    def update_with_request_file(self, requests_path):
        """Set ``date`` from the "submitted_time" of the model's request file.

        Any failure to open or parse the request file is reported on stdout and
        leaves ``date`` unchanged (best effort).
        """
        request_file = get_request_file_for_model(requests_path, self.full_model)

        try:
            with open(request_file, "r", encoding="utf-8") as f:
                request = json.load(f)
            self.date = request.get("submitted_time", "")
        except Exception:
            print(f"Could not find request file for {self.org}/{self.model}")

    def to_dict(self):
        """Convert the result to an ordered row for the dataframe display.

        Column order: model link, 'Average ⬆️', 'params', then one
        "<dataset label>(<metric label>)" column per displayed score.

        Raises:
            KeyError: If a dataset or metric key has no entry in the name mappings.
        """
        # NOTE(review): the size lookup is given model_link rather than full_model —
        # confirm this is the identifier the Hub lookup expects.
        params = _get_model_size(self.model_link) if self.params == '' else self.params

        scores = OrderedDict()
        for dataset_key, metrics in self.results.items():
            for metric_key, raw_value in metrics.items():
                dataset_label = DATASET_TO_NAME_MAPPING[dataset_key]
                metric_label = METRICS_TO_NAME_MAPPING[metric_key]
                # An empty label marks a dataset/metric that is not displayed
                if not dataset_label.strip() or not metric_label.strip():
                    continue
                scores[f'{dataset_label}({metric_label})'] = _parse_value_for_metric(raw_value, metric_key)

        # Average over the displayed scores only: 'params' must not leak into it when it
        # happens to be a float, and a result without displayed scores averages to 0.0
        # instead of dividing by zero.
        average = round(sum(scores.values()) / len(scores), 2) if scores else 0.0

        data_dict = OrderedDict()
        data_dict[AutoEvalColumn.model.name] = model_hyperlink(self.model_link, self.full_model)
        data_dict['Average ⬆️'] = average
        data_dict['params'] = params
        data_dict.update(scores)
        return data_dict
159
+
160
+
161
def get_request_file_for_model(requests_path, model_name):
    """Return the path of a FINISHED request file for ``model_name``, or "" if there is none.

    Candidates are the files matching ``<requests_path>/<model_name>_eval_request_*.json``.
    They are visited in descending name order and every one of them is parsed; the
    last FINISHED file visited is the one returned.
    """
    # NOTE(review): because later matches overwrite earlier ones, the file that sorts
    # first by name wins when several are FINISHED — confirm that this is intended.
    pattern = os.path.join(requests_path, f"{model_name}_eval_request_*.json")
    candidates = glob.glob(pattern)
    candidates.sort(reverse=True)

    selected = ""
    for candidate in candidates:
        with open(candidate, "r") as handle:
            status = json.load(handle)["status"]
        if status == "FINISHED":
            selected = candidate
    return selected
179
+
180
+
181
def get_raw_eval_results(results_path: str) -> "list[EvalResult]":
    """Collect the evaluation results found under ``results_path``.

    Folders that are empty or contain any non-JSON file are ignored. Results that
    share the same ``eval_name`` are merged into the first one encountered.

    Args:
        results_path: Root folder of the downloaded result files.

    Returns:
        One ``EvalResult`` per distinct ``eval_name``.

    Raises:
        KeyError: If a result uses a dataset or metric key without a display mapping
            (raised by the ``to_dict`` completeness check).
    """
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        if not files or any(not f.endswith(".json") for f in files):
            continue

        # Order the files of a folder by name, without the "results_" prefix, the
        # ".json" suffix and the last 7 characters of what remains (presumably the
        # fractional seconds of a timestamp — confirm against the file naming scheme).
        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        model_result_filepaths.extend(os.path.join(root, file) for file in files)

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        eval_result = EvalResult.init_from_json_file(model_result_filepath)

        # Store results of same eval together
        existing = eval_results.get(eval_result.eval_name)
        if existing is None:
            eval_results[eval_result.eval_name] = eval_result
        else:
            existing.results.update({k: v for k, v in eval_result.results.items() if v is not None})

    results = []
    for v in eval_results.values():
        v.to_dict()  # completeness check; an unmapped key propagates as KeyError
        results.append(v)
    return results
src/pages/about.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.about import LLM_BENCHMARKS_TEXT
4
+
5
def show_about_page(index: int):
    """Add the "About" tab, whose body is the benchmark description markdown.

    Args:
        index: ``id`` given to the tab item.
    """
    about_tab = gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=index)
    with about_tab:
        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
src/pages/result_table.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, SearchColumns, SelectColumns, ColumnFilter
3
+ from src.leaderboard.read_evals import FILTERED_ONLY_FIELD
4
+ from src.envs import EVAL_RESULTS_PATH
5
+ from src.populate import get_leaderboard_df
6
+ from src.display.utils import (
7
+ AutoEvalColumn,
8
+ fields,
9
+ )
10
+
11
def show_result_page(root_path: str, title: str, index: int):
    """Add one leaderboard tab for the results stored under ``EVAL_RESULTS_PATH/root_path``.

    Args:
        root_path: Sub-folder of the local results directory to load.
        title: Label of the tab.
        index: ``id`` given to the tab item.

    Returns:
        The ``Leaderboard`` component created inside the tab.
    """
    # Only the dataframe is used; the first element of the returned pair is discarded.
    _, original_df = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/{root_path}")
    leaderboard_df = original_df.copy()
    fixed_columns = fields(AutoEvalColumn)

    with gr.TabItem(title, elem_id="llm-benchmark-tab-table", id=index):
        # Every dataframe column is selected by default; the AutoEvalColumn columns
        # cannot be deselected.
        column_picker = SelectColumns(
            default_selection=list(original_df.keys()),
            cant_deselect=[column.name for column in fixed_columns],
            label="Select Columns to show:",
        )
        model_search = SearchColumns(
            primary_column=AutoEvalColumn.model.name,
            secondary_columns=[],
            placeholder="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
            label="Search",
        )
        size_filter = ColumnFilter(
            'params',
            type="slider",
            min=0.01,
            max=150,
            label="Select the number of parameters (B)",
        )
        return Leaderboard(
            value=leaderboard_df,
            datatype=[column.type for column in fixed_columns],
            select_columns=column_picker,
            hide_columns=FILTERED_ONLY_FIELD,
            search_columns=model_search,
            filter_columns=[size_filter],
            bool_checkboxgroup_label="Hide models",
            interactive=False,
        )
src/pages/submit.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.display.utils import EVAL_COLS, EVAL_TYPES
2
+ from src.envs import EVAL_REQUESTS_PATH
3
+ from src.populate import get_evaluation_queue_df
4
+ from src.submission.submit import add_new_eval
5
+ import gradio as gr
6
+
7
def show_submit_page(index: int):
    """Add the "Submit" tab: evaluation-queue tables plus the model submission form.

    Args:
        index: Id given to the created tab item.
    """
    # NOTE(review): the nesting of the `with` blocks below was reconstructed;
    # confirm it matches the intended layout.
    # The queue is read once, when the page is built.
    (
        finished_eval_queue_df,
        running_eval_queue_df,
        pending_eval_queue_df,
    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
    with gr.TabItem("🚀 Submit! ", elem_id="llm-benchmark-tab-table", id=index):

        with gr.Column():
            # Collapsed table of finished evaluations; the count is shown in the title.
            with gr.Accordion(
                f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                open=False,
            ):
                with gr.Row():
                    finished_eval_table = gr.components.Dataframe(
                        value=finished_eval_queue_df,
                        headers=EVAL_COLS,
                        datatype=EVAL_TYPES,
                        row_count=5,
                    )

            # Collapsed table of pending evaluations.
            # NOTE(review): running_eval_queue_df is fetched above but never displayed.
            with gr.Accordion(
                f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                open=False,
            ):
                with gr.Row():
                    pending_eval_table = gr.components.Dataframe(
                        value=pending_eval_queue_df,
                        headers=EVAL_COLS,
                        datatype=EVAL_TYPES,
                        row_count=5,
                    )

        with gr.Row():
            gr.Markdown("# ✉️✨ Submit your model!", elem_classes="markdown-text")

        # Submission form inputs.
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Huggingface Model")
                link_to_model_blog = gr.Textbox(label="Model release blog / Technical report")

        submit_button = gr.Button("Submit Model")
        submission_result = gr.Markdown()
        # add_new_eval receives (model, link); its return value is rendered
        # in the markdown component below the button.
        submit_button.click(
            add_new_eval,
            [
                model_name_textbox,
                link_to_model_blog
            ],
            submission_result,
        )

        with gr.Row():
            gr.Markdown('# ✉️✨ Submit your task <a href="https://github.com">here!</a>', elem_classes="markdown-text")
src/populate.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from src.display.utils import EvalQueueColumn
7
+ from src.leaderboard.read_evals import get_raw_eval_results
8
+
9
+
10
def get_leaderboard_df(results_path: str) -> tuple[list, pd.DataFrame]:
    """Create a dataframe from all the individual experiment results.

    Args:
        results_path: Location handed to ``get_raw_eval_results``.

    Returns:
        A ``(raw_data, df)`` pair: the raw results exactly as returned by
        ``get_raw_eval_results``, and a dataframe with one row per result
        (built from each result's ``to_dict()``) rounded to 2 decimals.
    """
    raw_data = get_raw_eval_results(results_path)
    records = [result.to_dict() for result in raw_data]

    df = pd.DataFrame.from_records(records).round(decimals=2)
    return raw_data, df
18
+
19
+
20
def _load_eval_request(file_path: str) -> dict:
    """Load one eval-request JSON file and default its revision to "main"."""
    with open(file_path, encoding="utf-8") as fp:
        data = json.load(fp)
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
    return data


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the dataframes for the evaluation-queue requests.

    Request files are ``*.json`` files located either directly in ``save_path``
    or one level below it. Hidden entries (leading ".") and any file that is
    not a ``.json`` file (README, notes, ...) are ignored instead of being
    parsed or listed as a folder.

    Args:
        save_path: Root directory of the evaluation requests.
        cols: Columns to keep in every returned dataframe.

    Returns:
        ``(finished, running, pending)`` dataframes, each restricted to ``cols``.

    Raises:
        KeyError: If a request file has no ``"status"`` entry.
    """
    all_evals = []

    for entry in os.listdir(save_path):
        if entry.startswith("."):
            continue
        entry_path = os.path.join(save_path, entry)

        if os.path.isdir(entry_path):
            # One folder per user/organisation holding that user's requests.
            for sub_entry in os.listdir(entry_path):
                if sub_entry.startswith(".") or not sub_entry.endswith(".json"):
                    continue
                all_evals.append(_load_eval_request(os.path.join(entry_path, sub_entry)))
        elif entry.endswith(".json"):
            all_evals.append(_load_eval_request(entry_path))

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Validate that a repo's model card exists, declares a license and has enough text.

    Args:
        repo_id: Repository id whose model card is loaded.

    Returns:
        ``(True, "")`` when every check passes, otherwise ``(False, reason)``
        with a user-facing explanation of the first failed check.
    """
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    metadata = card.data
    # A `license` entry is enough on its own; without it, both custom-license
    # fields have to be present.
    if metadata.license is None and ("license_name" not in metadata or "license_link" not in metadata):
        return False, (
            "License not found. Please add a license to your model card using the `license` metadata or a"
            " `license_name`/`license_link` pair."
        )

    # Cards with fewer than 200 characters of text are rejected.
    description_too_short = len(card.text) < 200
    if description_too_short:
        return False, "Please add a description to your model card, it is too short."

    return True, ""
33
+
34
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> "tuple[bool, str | None, AutoConfig | None]":
    """Check that ``model_name`` can be loaded from the hub with the Auto classes.

    Args:
        model_name: Model id passed to ``AutoConfig`` / ``AutoTokenizer``.
        revision: Revision to load.
        token: Optional access token forwarded to ``from_pretrained``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        test_tokenizer: When true, also try to load the tokenizer.

    Returns:
        ``(True, None, config)`` on success, otherwise ``(False, message, None)``.
        The messages are sentence fragments ("was not found on hub!", ...),
        presumably meant to be appended after the model name by the caller.
    """
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
        if test_tokenizer:
            try:
                # Only loadability matters; the tokenizer itself is discarded.
                AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
            except ValueError as e:
                return (
                    False,
                    f"uses a tokenizer which is not in a transformers release: {e}",
                    None
                )
            except Exception:
                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
        return True, None, config

    except ValueError:
        # Any ValueError raised while loading the config is reported as a
        # missing `trust_remote_code=True`.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None
        )

    except Exception:
        # Every other failure is reported as the model not being on the hub.
        return False, "was not found on hub!", None
60
+
61
+
62
def get_model_size(model_info: ModelInfo):
    """Return the model size in billions of parameters, from its safetensors metadata.

    The total parameter count is read from ``model_info.safetensors``, which
    may be a mapping with a ``"total"`` key or an object exposing a ``total``
    attribute.

    Args:
        model_info: Hub model information.

    Returns:
        The parameter count divided by 1e9 and rounded to 3 decimals, or ``0``
        when the size is unknown (metadata missing or without a usable total).
    """
    safetensors = getattr(model_info, "safetensors", None)
    if isinstance(safetensors, dict):
        total = safetensors.get("total")
    else:
        total = getattr(safetensors, "total", None)

    try:
        return round(total / 1e9, 3)
    except TypeError:
        # Missing or non-numeric total: unknown sizes are reported as 0.
        return 0
72
+
73
def get_model_arch(model_info: ModelInfo):
    """Return the model architecture(s) listed in the model's configuration.

    Args:
        model_info: Hub model information.

    Returns:
        The ``"architectures"`` entry of ``model_info.config``, or ``"Unknown"``
        when the entry or the whole configuration is missing.
    """
    # `config` can be absent or None; treat both as an empty configuration.
    config = getattr(model_info, "config", None) or {}
    return config.get("architectures", "Unknown")
76
+
77
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Gather the already submitted models to avoid duplicates.

    Only ``*.json`` request files located exactly one directory below
    ``requested_models_dir`` are read.

    Args:
        requested_models_dir: Root directory of the evaluation requests; a
            trailing path separator is accepted.

    Returns:
        A ``(submitted, users_to_submission_dates)`` pair. ``submitted`` is the
        set of ``"<model>_<revision>"`` keys (revision defaults to ``"main"``
        when a request does not record one). ``users_to_submission_dates`` maps
        the organisation part of ``"<organisation>/<name>"`` model ids to the
        list of their ``submitted_time`` values.
    """
    depth = 1
    file_names = set()
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        # Depth relative to the root directory; computed from the relative
        # path so that a trailing separator on the argument does not skew it.
        rel_root = os.path.relpath(root, requested_models_dir)
        current_depth = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
        if current_depth != depth:
            continue

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            file_names.add(f"{info['model']}_{info.get('revision', 'main')}")

            # Select organisation
            if "/" not in info["model"] or "submitted_time" not in info:
                continue
            organisation = info["model"].split("/", 1)[0]
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return file_names, users_to_submission_dates
src/submission/submit.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ from src.submission.check_validity import (
8
+ already_submitted_models,
9
+ check_model_card,
10
+ get_model_size,
11
+ )
12
+
13
# Process-wide cache of the submissions found in the request queue. Both values
# are filled on the first call to add_new_eval() and stay None until then.
REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None


def add_new_eval(
    model: str,
    link: str,
):
    """Validate a model submission and add it to the evaluation queue.

    The model's hub information, license and model card are checked; if they
    are acceptable and the model was not submitted before, a request file is
    written locally, uploaded to ``QUEUE_REPO`` and removed again.

    Args:
        model: Model id on the hub, ``"<user>/<name>"`` or ``"<name>"``.
        link: Link to the model release blog / technical report, stored as-is.

    Returns:
        The result of ``styled_error`` / ``styled_warning`` / ``styled_message``
        describing the outcome.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    # `is None` rather than truthiness: an empty queue is a valid cached state
    # and must not trigger a rescan on every submission.
    if REQUESTED_MODELS is None:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    revision = "main"

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    model_size = get_model_size(model_info=model_info)

    # Were the model card and license filled?
    try:
        model_license = model_info.cardData["license"]
    except Exception:
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "revision": revision,
        "link": link,
        "status": "PENDING",
        "submitted_time": current_time,
        "params": model_size,
        "license": model_license,
        "private": False,
    }

    # Check for duplicate submission
    submission_key = f"{model}_{revision}"
    if submission_key in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_eval_request_False.json"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    try:
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    finally:
        # Remove the local file, even when the upload failed, so that no
        # stale request is left behind on disk.
        os.remove(out_path)

    # Remember the submission so a repeated request in this process is
    # recognised as a duplicate without rescanning the queue.
    REQUESTED_MODELS.add(submission_key)
    if user_name:
        USERS_TO_SUBMISSION_DATES.setdefault(user_name, []).append(current_time)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )