update leaderboard
- gen_table.py +2 -0
- meta_data.py +9 -0
gen_table.py CHANGED
```diff
@@ -128,6 +128,8 @@ def BUILD_L2_DF(results, dataset):
         df = df.sort_values('Final Score')
     elif dataset == 'COCO_VAL':
         df = df.sort_values('CIDEr')
+    elif dataset == 'VCR':
+        df = df.sort_values('Overall-Jaccard')
     else:
         df = df.sort_values('Overall')
     df = df.iloc[::-1]
```
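The two added lines extend the metric-selection chain in BUILD_L2_DF: each dataset sorts by its primary metric, and the trailing `iloc[::-1]` reverses the ascending sort so the best-scoring model lands in row 0. Below is a minimal runnable sketch of that pattern with a toy DataFrame; the dispatch table and data are illustrative, and the real function handles more datasets than shown here.

```python
# Sketch (assumed, simplified) of the sorting pattern in BUILD_L2_DF:
# pick the dataset's primary metric, sort ascending, then reverse so
# the best-scoring model ends up first. Toy data for illustration only.
import pandas as pd

SORT_KEYS = {
    'COCO_VAL': 'CIDEr',
    'VCR': 'Overall-Jaccard',  # the key this commit adds
}

def rank_leaderboard(df: pd.DataFrame, dataset: str) -> pd.DataFrame:
    key = SORT_KEYS.get(dataset, 'Overall')  # fall back to 'Overall'
    return df.sort_values(key).iloc[::-1]    # descending: best first

toy = pd.DataFrame({'Model': ['A', 'B'], 'Overall-Jaccard': [52.1, 67.4]})
print(rank_leaderboard(toy, 'VCR'))  # row for model B comes first
```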
meta_data.py CHANGED
```diff
@@ -227,3 +227,12 @@ LEADERBOARD_MD['BLINK'] = """
 - BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”, but pose significant challenges for current multimodal large language models (LLMs).
 - We evaluate BLINK on the test set of the benchmark, which contains 1901 visual questions in multi-choice format.
 """
+
+LEADERBOARD_MD['VCR'] = """
+## VCR Evaluation Results
+
+- VCR challenges models to restore partially obscured text within images, leveraging pixel-level hints and contextual cues. Unlike traditional text-based tasks, VCR necessitates a synergistic understanding of visual image (VI), string text (ST), and text embedded in image (TEI). Our dataset is crafted using a pipeline that generates synthetic images from image-caption pairs with adjustable caption visibility, allowing for varied difficulty levels.
+- We report the Jaccard / Exact Match score for VCR, evaluated on the 500-sample subsets of each track in VCR with VLMEvalKit.
+- The evaluation results are officially provided by the VCR authors; thanks to Tianyu Zhang for his help.
+
+"""
```
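For reference, the new entry reports Jaccard / Exact Match scores. The official VCR scorer (run by the VCR authors via VLMEvalKit) may tokenize and normalize differently; the sketch below only illustrates the generic definitions of the two metrics, with made-up example strings.

```python
# Illustrative definitions of the metrics named in the VCR entry:
# token-set Jaccard = |A ∩ B| / |A ∪ B|, and a plain exact-match check.
# Not the official VCR scoring code; normalization rules are assumed.
def jaccard(pred: str, ref: str) -> float:
    a, b = set(pred.lower().split()), set(ref.lower().split())
    return len(a & b) / len(a | b) if a | b else 1.0

def exact_match(pred: str, ref: str) -> float:
    return float(pred.strip().lower() == ref.strip().lower())

print(jaccard('the quick brown fox', 'the quick red fox'))  # 0.6
print(exact_match('Hello world', 'hello world'))            # 1.0
```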