shadowcun committed
Commit 7d9d01e
1 parent(s): d43e08a

Initial commit

Files changed:
- app.py +121 -0
- constants.py +47 -0
- file/result.csv +6 -0
- file/result1.csv +28 -0
- requirements.txt +70 -0
- src/__pycache__/utils_display.cpython-311.pyc +0 -0
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/__pycache__/utils_display.cpython-39.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-311.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-39.pyc +0 -0
- src/auto_leaderboard/model_metadata_type.py +30 -0
- src/utils_display.py +99 -0
- test.py +0 -0
app.py
ADDED
@@ -0,0 +1,121 @@
+"""
+Adapted from the SEED-Bench Leaderboard by AILab-CVC
+Source: https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard
+"""
+
+__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
+
+import gradio as gr
+import pandas as pd
+import json
+import pdb
+import tempfile
+
+from constants import *
+from src.auto_leaderboard.model_metadata_type import ModelType
+
+global data_component, filter_component
+
+
+def upload_file(files):
+    file_paths = [file.name for file in files]
+    return file_paths
+
+def get_baseline_df():
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Final Sum Score", ascending=False)
+    present_columns = MODEL_INFO + checkbox_group.value
+    df = df[present_columns]
+    print(df)
+    return df
+
+def get_all_df():
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Final Sum Score", ascending=False)
+    print(df)
+    return df
+
+block = gr.Blocks()
+
+
+with block:
+    gr.Markdown(
+        LEADERBORAD_INTRODUCTION
+    )
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 EvalCrafter Benchmark", elem_id="evalcrafter-benchmark-tab-table", id=0):
+
+            gr.Markdown(
+                TABLE_INTRODUCTION
+            )
+
+            # selection for column part:
+            checkbox_group = gr.CheckboxGroup(
+                choices=TASK_INFO_v2,
+                value=AVG_INFO,
+                label="Select options",
+                interactive=True,
+            )
+
+            # Create the dataframe component
+            # pdb.set_trace()
+            data_component = gr.components.Dataframe(
+                value=get_baseline_df,
+                headers=COLUMN_NAMES,
+                type="pandas",
+                datatype=DATA_TITILE_TYPE,
+                interactive=False,
+                visible=True,
+            )
+
+            def on_checkbox_group_change(selected_columns):
+                # pdb.set_trace()
+                selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
+                present_columns = MODEL_INFO + selected_columns
+                updated_data = get_all_df()[present_columns]
+                updated_data = updated_data.sort_values(by=present_columns[3], ascending=False)
+                updated_headers = present_columns
+                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                # pdb.set_trace()
+                filter_component = gr.components.Dataframe(
+                    value=updated_data,
+                    headers=updated_headers,
+                    type="pandas",
+                    datatype=update_datatype,
+                    interactive=False,
+                    visible=True,
+                )
+                # pdb.set_trace()
+                return filter_component.value
+
+            # Link the checkbox group to the handler function
+            checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
+
+
+        # table 2
+        with gr.TabItem("📝 About", elem_id="evalcrafter-benchmark-tab-table", id=2):
+            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+
+
+    with gr.Row():
+        data_run = gr.Button("Refresh")
+        data_run.click(
+            get_baseline_df, outputs=data_component
+        )
+
+    gr.Markdown(r"""
+    Please cite this paper if you find it useful ♥️:
+
+    ```bibtex
+    @inproceedings{Liu2023EvalCrafterBA,
+        title={EvalCrafter: Benchmarking and Evaluating Large Video Generation Models},
+        author={Yaofang Liu and Xiaodong Cun and Xuebo Liu and Xintao Wang and Yong Zhang and Haoxin Chen and Yang Liu and Tieyong Zeng and Raymond Chan and Ying Shan},
+        year={2023},
+        url={https://api.semanticscholar.org/CorpusID:264172222}
+    }
+    ```
+    """)
+    # block.load(get_baseline_df, outputs=data_title)
+
+block.launch(share=False)
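
For reference, the column filtering that `on_checkbox_group_change` performs reduces to a plain pandas selection over the leaderboard CSV. A minimal standalone sketch (illustrative only; `selected` stands in for whatever boxes a user ticks, and it assumes `./file/result.csv` from this commit is present):

```python
import pandas as pd

# Values taken from constants.py in this commit.
MODEL_INFO = ['Models', 'Ver.', 'Abilities']
selected = ['Final Sum Score', 'Motion Quality']  # hypothetical checkbox selection

df = pd.read_csv("./file/result.csv")
present_columns = MODEL_INFO + selected
# Mirrors app.py: sort by the first selected metric (index 3, right after the model-info columns).
filtered = df[present_columns].sort_values(by=present_columns[3], ascending=False)
print(filtered)
```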
constants.py
ADDED
@@ -0,0 +1,47 @@
+# This .py file stores constants
+MODEL_INFO = ['Models', 'Ver.', 'Abilities']
+TASK_INFO = ['Resolution', 'FPS', 'Open Source', 'Length', 'Speed', 'Motion', 'Camera', 'Final Sum Score', 'Motion Quality', 'Text-Video Alignment', 'Visual Quality', 'Temporal Consistency']
+TASK_INFO_v2 = ['Final Sum Score', 'Motion Quality', 'Text-Video Alignment', 'Visual Quality', 'Temporal Consistency', 'Resolution', 'FPS', 'Open Source', 'Length', 'Speed', 'Motion', 'Camera']
+
+AVG_INFO = ['Final Sum Score', 'Motion Quality', 'Text-Video Alignment', 'Visual Quality', 'Temporal Consistency']
+DATA_TITILE_TYPE = ["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
+CSV_DIR = "./file/result.csv"
+
+# COLUMN_NAMES = MODEL_INFO + TASK_INFO
+COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
+
+DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
+
+
+LEADERBORAD_INTRODUCTION = """# EvalCrafter Leaderboard 🏆
+
+Welcome to the cutting-edge leaderboard for text-to-video generation, where we meticulously evaluate state-of-the-art generative models using our comprehensive framework, ensuring high-quality results that align with user opinions. Join us in this exciting journey towards excellence! 🛫
+
+More methods will be evalcrafted soon, stay tuned ❤️ Join our evaluation by sending an email 📧 ([email protected])! You may also read the [EvalCrafter paper](https://arxiv.org/abs/2310.11440) for more detailed information 🤗
+"""
+
+TABLE_INTRODUCTION = """In the table below, we summarize the per-dimension performance of all the models. """
+
+LEADERBORAD_INFO = """
+Vision and language generative models have grown rapidly in recent years. For video generation,
+various open-source models and publicly available services have been released for generating high-visual-quality videos.
+However, these methods often use a few academic metrics, e.g., FVD or IS, to evaluate performance. We argue that
+it is hard to judge large conditional generative models with these simple metrics, since the models are often trained
+on very large datasets with multi-aspect abilities. Thus, we propose a new framework and pipeline to exhaustively evaluate
+the performance of the generated videos. To achieve this, we first construct a new prompt list for text-to-video generation
+by analyzing a real-world prompt list with the help of a large language model. Then, we evaluate the state-of-the-art video
+generative models on our carefully designed benchmarks, in terms of visual quality, content quality, motion quality, and
+text-caption alignment with around 18 objective metrics. To obtain the final leaderboard of the models, we also fit a series of
+coefficients to align the objective metrics with the users' opinions. Based on the proposed opinion-alignment method, our final score
+shows a higher correlation than simply averaging the metrics, showing the effectiveness of the proposed evaluation method.
+"""
+
+
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@inproceedings{Liu2023EvalCrafterBA,
+    title={EvalCrafter: Benchmarking and Evaluating Large Video Generation Models},
+    author={Yaofang Liu and Xiaodong Cun and Xuebo Liu and Xintao Wang and Yong Zhang and Haoxin Chen and Yang Liu and Tieyong Zeng and Raymond Chan and Ying Shan},
+    year={2023},
+    url={https://api.semanticscholar.org/CorpusID:264172222}
+}"""
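
The LEADERBORAD_INFO text mentions fitting a series of coefficients that align the objective metrics with user opinions. That fitting code is not part of this commit; purely as an illustration of the idea, a least-squares fit of per-metric weights could look like the sketch below, which reuses the numbers from file/result.csv as example data (this is not the actual EvalCrafter fitting procedure or data):

```python
import numpy as np

# Rows = models from file/result.csv; columns = Motion Quality, Text-Video Alignment,
# Visual Quality, Temporal Consistency (example data only).
metric_scores = np.array([
    [59.41, 47.22, 55.23, 59.31],  # ModelScope-XL
    [54.26, 46.18, 56.37, 61.19],  # ZeroScope
    [51.97, 51.29, 59.53, 56.36],  # Floor33 Pictures
    [57.74, 54.11, 63.52, 69.35],  # PikaLab
    [62.53, 52.30, 67.35, 69.71],  # Gen2
])
target = np.array([221, 218, 219, 245, 252])  # "Final Sum Score" column, used here only as a fit target

# Fit one weight per metric so the weighted sum approximates the target scores.
weights, *_ = np.linalg.lstsq(metric_scores, target, rcond=None)
print(weights)
print(metric_scores @ weights)  # aligned scores under this illustrative fit
```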
file/result.csv
ADDED
@@ -0,0 +1,6 @@
+Models,Ver.,Abilities,Resolution,FPS,Open Source,Length,Speed,Motion,Camera,Final Sum Score,Motion Quality,Text-Video Alignment,Visual Quality,Temporal Consistency
+ModelScope-XL,23.08,I2V & V2V,1280x720,8,✓,4s,8 min+,-,-,221 ,59.41,47.22,55.23,59.31
+ZeroScope,23.06,T2V & V2V,1024x576,8,✓,4s,3 min,-,-,218 ,54.26,46.18,56.37,61.19
+Floor33 Pictures,23.08,T2V,1280x720,8,-,2s,4 min,-,-,219 ,51.97,51.29,59.53,56.36
+PikaLab,23.09,I2V OR T2V,1088x640,24,-,3s,1 min,✓,✓,245 ,57.74,54.11,63.52,69.35
+Gen2,23.09,I2V OR T2V,896x512,24,-,4s,1 min,✓,✓,252 ,62.53,52.3,67.35,69.71
file/result1.csv
ADDED
@@ -0,0 +1,28 @@
+Model Type,Model,Language Model,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attributes,Instance Localization,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
+LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,27.7,27.3,28.6,23,29,32.8,31.8,20.5,31.8,33,18.2,19.4,23.2,34.9,25.4
+LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
+LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37,9,33,23.1,26.2
+ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24
+ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
+ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
+ImageLLM,[LLaVA](https://github.com/haotian-liu/LLaVA),Vicuna-13B,61.6,68.2,42.7,74.9,71.3,68.9,63.5,61.3,51.4,73.2,77,60.5,48.9,41.1,36.6
+ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
+ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32,53.2,30.6,39.5,24.3,31.9
+ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24
+ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32,51.4,31.8,37.9,27.2,24.8
+ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20,37.2,25.4,24.2
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,40.9,42.7,35.7,53.2,45.3,40,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
+ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,32.7,35.2,25.8,45.2,38.5,29.3,33,29.7,35.5,39.2,52,24.7,38.6,18.5,19.6
+ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32,32,51.1,27.1,33.9,25.4,23
+ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,34,37.9,23,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
+ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,50,54.4,37.5,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27
+ImageLLM,[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,58.2,65.4,37.8,73.3,67.3,69.6,57.7,52.9,48.2,59.8,74.6,53.5,43.9,39.2,26.7
+ImageLLM,[Qwen-VL](https://huggingface.co/Qwen/Qwen-VL),Qwen-7B,56.3,62.3,39.1,71.2,66.4,67.7,53.5,44.8,43.8,62.9,74.9,51.2,44.7,38.5,32
+ImageLLM,[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,0,44.5,0,55.8,45.3,42.3,40.2,36.8,34.9,37.1,55.9,38.8,0,0,0
+ImageLLM,[IDEFICS-80b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-65B,0,53.2,0,64,52.6,50.8,48.3,46.1,45.5,62.9,68,51.8,0,0,0
+ImageLLM,[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,0,66.9,0,75,71.7,67.6,60.8,56.2,55.3,74.4,77,48.5,0,0,0
+ImageLLM,[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13b,48.9,53.7,35.4,64.1,54.2,54.1,46.5,45.3,38.2,51.6,60.7,44.7,37.8,45.3,20.0
+VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,37.6,39,33.7,47.1,43.8,34.9,40,32.8,34.6,42.3,50.5,17.7,34.9,36.4,27.3
+VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,31.2,33.9,23.5,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,27.6,21.3,21.1
+VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,30.3,32,25.4,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,31.3,23.2,20.7
requirements.txt
ADDED
@@ -0,0 +1,70 @@
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+APScheduler==3.10.1
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+contourpy==1.0.7
+cycler==0.11.0
+datasets==2.12.0
+entrypoints==0.4
+fastapi==0.95.1
+ffmpy==0.3.0
+filelock==3.11.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+gradio==3.27.0
+gradio_client==0.1.3
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.13.4
+idna==3.4
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.4
+numpy==1.24.2
+orjson==3.8.10
+packaging==23.1
+pandas==2.0.0
+Pillow==9.5.0
+plotly==5.14.1
+pyarrow==11.0.0
+pydantic==1.10.7
+pydub==0.25.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.28.2
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.26.1
+toolz==0.12.0
+tqdm==4.65.0
+transformers==4.28.1
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+uc-micro-py==1.0.1
+urllib3==1.26.15
+uvicorn==0.21.1
+websockets==11.0.1
+yarl==1.8.2
src/__pycache__/utils_display.cpython-311.pyc
ADDED
Binary file (6.25 kB)

src/__pycache__/utils_display.cpython-38.pyc
ADDED
Binary file (4.31 kB)

src/__pycache__/utils_display.cpython-39.pyc
ADDED
Binary file (4.24 kB)

src/auto_leaderboard/__pycache__/model_metadata_type.cpython-311.pyc
ADDED
Binary file (1.72 kB)

src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
ADDED
Binary file (1.26 kB)

src/auto_leaderboard/__pycache__/model_metadata_type.cpython-39.pyc
ADDED
Binary file (1.2 kB)
src/auto_leaderboard/model_metadata_type.py
ADDED
@@ -0,0 +1,30 @@
+from dataclasses import dataclass
+from enum import Enum
+import glob
+import json
+import os
+from typing import Dict, List
+
+from ..utils_display import AutoEvalColumn
+
+@dataclass
+class ModelInfo:
+    name: str
+    symbol: str  # emoji
+
+model_type_symbols = {
+    "LLM": "🟢",
+    "ImageLLM": "🔶",
+    "VideoLLM": "⭕",
+    "Other": "🟦",
+}
+
+class ModelType(Enum):
+    PT = ModelInfo(name="LLM", symbol="🟢")
+    FT = ModelInfo(name="ImageLLM", symbol="🔶")
+    IFT = ModelInfo(name="VideoLLM", symbol="⭕")
+    RL = ModelInfo(name="Other", symbol="🟦")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
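
A quick usage sketch for ModelType.to_str (illustrative only, not part of the commit; assumes the repo root is on the Python path so the relative import of src.utils_display resolves):

```python
from src.auto_leaderboard.model_metadata_type import ModelType

print(ModelType.PT.to_str())        # 🟢 LLM
print(ModelType.IFT.to_str(" : "))  # ⭕ : VideoLLM
```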
src/utils_display.py
ADDED
@@ -0,0 +1,99 @@
+from dataclasses import dataclass
+
+# These classes are for user facing column names, to avoid having to change them
+# all around the code when a modif is needed
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+@dataclass(frozen=True)
+class AutoEvalColumn:  # Auto evals column
+    model_type_symbol = ColumnContent("T", "str", True)
+    model = ColumnContent("Model", "markdown", True)
+    average = ColumnContent("Average ⬆️", "number", True)
+    arc = ColumnContent("ARC", "number", True)
+    hellaswag = ColumnContent("HellaSwag", "number", True)
+    mmlu = ColumnContent("MMLU", "number", True)
+    truthfulqa = ColumnContent("TruthfulQA", "number", True)
+    model_type = ColumnContent("Type", "str", False)
+    precision = ColumnContent("Precision", "str", False, True)
+    license = ColumnContent("Hub License", "str", False)
+    params = ColumnContent("#Params (B)", "number", False)
+    likes = ColumnContent("Hub ❤️", "number", False)
+    revision = ColumnContent("Model sha", "str", False, False)
+    dummy = ColumnContent("model_name_for_query", "str", True)  # dummy col to implement search bar (hidden by custom CSS)
+
+@dataclass(frozen=True)
+class EloEvalColumn:  # Elo evals column
+    model = ColumnContent("Model", "markdown", True)
+    gpt4 = ColumnContent("GPT-4 (all)", "number", True)
+    human_all = ColumnContent("Human (all)", "number", True)
+    human_instruct = ColumnContent("Human (instruct)", "number", True)
+    human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
+
+
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "bool", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
+    status = ColumnContent("status", "str", True)
+
+LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
+
+
+KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
+OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+MODEL_PAGE = "https://huggingface.co/models"
+LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
+VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
+ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
+
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+
+    if model_name in LLAMAS:
+        link = LLAMA_LINK
+        model_name = model_name.split("/")[1]
+    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
+        link = VICUNA_LINK
+        model_name = "stable-vicuna-13b"
+    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
+        link = ALPACA_LINK
+        model_name = "alpaca-13b"
+    if model_name == "dolly-12b":
+        link = DOLLY_LINK
+    elif model_name == "vicuna-13b":
+        link = VICUNA_LINK
+    elif model_name == "koala-13b":
+        link = KOALA_LINK
+    elif model_name == "oasst-12b":
+        link = OASST_LINK
+    # else:
+    #     link = MODEL_PAGE
+
+    return model_hyperlink(link, model_name)
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
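
For illustration, make_clickable_model wraps a model name in an HTML anchor and special-cases a few known models (hypothetical example run, not part of the commit):

```python
from src.utils_display import make_clickable_model

# An unrecognized name links to its Hugging Face page.
print(make_clickable_model("Qwen/Qwen-VL-Chat"))
# -> <a target="_blank" href="https://huggingface.co/Qwen/Qwen-VL-Chat" ...>Qwen/Qwen-VL-Chat</a>

# Names in LLAMAS get the LLaMA blog link and a shortened label.
print(make_clickable_model("huggingface/llama-7b"))
# -> <a target="_blank" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/" ...>llama-7b</a>
```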
test.py
ADDED
File without changes