KennyUTC committed
Commit
c48f969
1 Parent(s): e77f84c

update leaderboard

Files changed (2)
  1. gen_table.py +2 -0
  2. meta_data.py +9 -0
gen_table.py CHANGED
@@ -128,6 +128,8 @@ def BUILD_L2_DF(results, dataset):
         df = df.sort_values('Final Score')
     elif dataset == 'COCO_VAL':
         df = df.sort_values('CIDEr')
+    elif dataset == 'VCR':
+        df = df.sort_values('Overall-Jaccard')
     else:
         df = df.sort_values('Overall')
     df = df.iloc[::-1]
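For context, the new branch sorts the VCR leaderboard by its 'Overall-Jaccard' column, and the trailing reversal then puts the best score first. A minimal sketch of that behavior, assuming df is a pandas DataFrame (the model names and scores below are illustrative, not real results):

import pandas as pd

# Illustrative leaderboard rows; names and scores are made up for this sketch.
df = pd.DataFrame({
    'Model': ['model-a', 'model-b', 'model-c'],
    'Overall-Jaccard': [0.72, 0.85, 0.64],
})

# Mirror the new branch in BUILD_L2_DF: ascending sort, then reverse
# with iloc[::-1] so the highest Overall-Jaccard lands in row 0.
df = df.sort_values('Overall-Jaccard')
df = df.iloc[::-1]
print(df)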
meta_data.py CHANGED
@@ -227,3 +227,12 @@ LEADERBOARD_MD['BLINK'] = """
 - BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”, but pose significant challenges for current multimodal large language models (LLMs).
 - We evaluate BLINK on the test set of the benchmark, which contains 1901 visual questions in multi-choice format.
 """
+
+LEADERBOARD_MD['VCR'] = """
+## VCR Evaluation Results
+
+- VCR challenges models to restore partially obscured text within images, leveraging pixel-level hints and contextual cues. Unlike traditional text-based tasks, VCR requires a synergistic understanding of visual image (VI), string text (ST), and text embedded in image (TEI). The dataset is crafted using a pipeline that generates synthetic images from image-caption pairs with adjustable caption visibility, allowing for varied difficulty levels.
+- We report the Jaccard / Exact Match scores for VCR, evaluated with VLMEvalKit on the 500-sample subset of each VCR track.
+- The evaluation results are officially provided by the VCR authors; thanks to Tianyu Zhang for his help.
+
+"""