teowu committed on
Commit
5e9cb18
•
1 Parent(s): 2cf064e

initial A1 results

Files changed (4)
  1. README.md +1 -1
  2. app.py +38 -0
  3. qbench_a1_single_dev.csv +25 -0
  4. qbench_a1_single_test.csv +25 -0
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Q Bench Leaderboard
- emoji: 🏃
+ emoji: 📊
  colorFrom: blue
  colorTo: yellow
  sdk: gradio
app.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ import pandas as pd
+ block = gr.Blocks(title="Q-Bench Leaderboard")
+
+ LEADERBOARD_INTRODUCTION = """# Q-Bench Leaderboard
+
+
+ <img style="width:40%" src="https://raw.githubusercontent.com/Q-Future/Q-Bench/master/logo.png">
+
+
+ *"How do multi-modality LLMs perform on low-level computer vision?"*
+ 🏆 Welcome to the leaderboard of **Q-Bench**! *A Comprehensive Benchmark Suite for General-purpose Foundation Models on Low-level Vision*
+ <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
+ <a href="https://github.com/Q-Future/"><img src="https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fvqassessment%2FQ-Bench&count_bg=%23E97EBA&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false"/></a>
+ <a href="https://github.com/Q-Future/Q-Bench"><img src="https://img.shields.io/github/stars/Q-Future/Q-Bench"/></a>
+ <a href="https://arxiv.org/abs/2309.14181"><img src="https://img.shields.io/badge/Arxiv-2309:14181-red"/></a>
+ <a href="https://github.com/Q-Future/Q-Bench/releases/tag/v1.0.1.1014datarelease"><img src="https://img.shields.io/badge/Data-Release-green"></a>
+ <a href="https://github.com/Q-Future/Q-Instruct"><img src="https://img.shields.io/badge/Awesome-QInstruct-orange"/></a>
+ </div>
+
+ - **Low-level Visual Perception (A1):** Open-range multiple-choice questions on low-level visual perception. Dataset: [LLVisionQA](https://huggingface.co/datasets/teowu/LLVisionQA-QBench)
+ - **Low-level Visual Description (A2):** Detailed description of low-level visual attributes. Dataset: [LLDescribe](https://huggingface.co/datasets/teowu/LLDescribe-QBench)
+ - **Visual Quality Assessment (A3):** MLLMs can give a *precise visual quality score* via *logprobs*!
+
+ For now, we only include results validated in our paper; user submissions will be supported soon.
+ """
+
+
+ with block:
+     gr.Markdown(
+         LEADERBOARD_INTRODUCTION
+     )
+     with gr.Tab("Perception-A1-dev"):
+         gr.DataFrame(pd.read_csv("qbench_a1_single_dev.csv"))
+     with gr.Tab("Perception-A1-test"):
+         gr.DataFrame(pd.read_csv("qbench_a1_single_test.csv"))
+
+ block.launch(share=True)
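Note on the A3 track referenced in the introduction string above: the "precise visual quality score via logprobs" idea amounts to a two-way softmax over the log-probabilities an MLLM assigns to quality anchor tokens. The snippet below is a minimal, self-contained sketch of that idea, not the exact Q-Bench implementation; the helper name, the "good"/"poor" anchor tokens, and the example values are assumptions for illustration.

import math

def quality_score_from_logprobs(logprob_good: float, logprob_poor: float) -> float:
    # Hypothetical helper: turn the model's log-probabilities for the anchor
    # tokens "good" and "poor" (e.g. the next token after a prompt such as
    # "The quality of this image is") into a score in [0, 1] via a
    # two-way softmax.
    p_good = math.exp(logprob_good)
    p_poor = math.exp(logprob_poor)
    return p_good / (p_good + p_poor)

# Example with made-up logprobs where the model slightly favours "good":
print(quality_score_from_logprobs(-0.7, -1.4))  # ~0.67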
qbench_a1_single_dev.csv ADDED
@@ -0,0 +1,25 @@
+ Model (variant),Yes-or-No,What,How,Distortion,Other,In-context Distortion,In-context Other,Overall
+ InfiMM (Zephyr-7B),57.45,57.96,44.62,47.27,57.17,49.67,64.08,53.37
+ Emu2-Chat (LLaMA-33B),71.81,67.25,56.18,64.78,63.19,63.48,72.24,65.28
+ Fuyu-8B (Persimmon-8B),53.33,43.7,38.0,40.81,47.4,45.45,49.23,45.05
+ BakLLava (Mistral-7B),66.0,56.16,51.12,51.15,61.57,53.72,72.0,57.48
+ SPHINX,74.18,68.81,62.07,63.62,71.76,66.12,76.33,68.56
+ mPLUG-Owl2 (LLaMA-7B),72.18,57.96,56.19,56.68,69.21,53.29,72.65,61.61
+ LLaVA-v1.5 (Vicuna-v1.5-7B),66.36,58.19,50.51,49.42,65.74,54.61,70.61,58.66
+ LLaVA-v1.5 (Vicuna-v1.5-13B),65.27,64.38,56.59,56.03,67.13,61.18,67.35,62.14
+ InternLM-XComposer-VL (InternLM),69.45,65.27,60.85,61.67,70.14,56.91,75.1,65.35
+ IDEFICS-Instruct (LLaMA-7B),56.18,44.69,44.02,42.8,54.17,44.74,56.33,48.7
+ Qwen-VL (QwenLM),63.09,58.19,56.39,50.58,62.73,57.89,73.88,59.4
+ Shikra (Vicuna-7B),65.64,47.35,49.09,48.83,59.49,50.0,64.08,54.65
+ Otter-v1 (MPT-7B),57.09,40.71,39.55,42.22,49.31,44.08,52.65,46.35
+ InstructBLIP (Flan-T5-XL),67.64,59.96,55.98,56.23,65.51,58.22,69.39,61.47
+ InstructBLIP (Vicuna-7B),71.64,52.65,43.81,48.64,62.5,55.59,64.9,56.72
+ VisualGLM-6B (GLM-6B),60.18,54.2,46.25,51.75,54.4,53.62,57.14,53.78
+ mPLUG-Owl (LLaMA-7B),66.0,54.87,44.02,51.36,55.09,54.28,65.71,55.38
+ LLaMA-Adapter-V2,66.18,59.29,52.13,57.39,56.25,63.16,64.9,59.46
+ LLaVA-v1 (Vicuna-13B),54.0,53.1,55.38,48.64,54.63,55.59,63.27,54.18
+ MiniGPT-4 (Vicuna-13B),55.82,50.22,40.37,42.02,48.38,51.97,61.22,49.03
+ Qwen-VL-Plus (Close-Source),73.77,69.47,53.88,66.21,65.72,63.81,68.75,66.04
+ Qwen-VL-Max (Close-Source),75.6,79.43,66.09,73.39,74.08,71.0,76.92,73.63
+ Gemini-Pro (Close-Source),68.8,73.74,62.34,66.3,71.34,63.91,73.09,68.16
+ GPT-4V (Close-Source),76.85,79.17,67.52,73.53,76.18,72.83,76.47,74.51
qbench_a1_single_test.csv ADDED
@@ -0,0 +1,25 @@
+ Model (variant),Yes-or-No,What,How,Distortion,Other,In-context Distortion,In-context Other,Overall
+ InfiMM (Zephyr-7B),61.31,56.61,49.58,47.79,62.05,51.71,67.68,56.05
+ Emu2-Chat (LLaMA-33B),70.09,65.12,54.11,66.22,62.96,63.47,73.21,64.32
+ Fuyu-8B (Persimmon-8B),62.22,35.79,36.62,41.07,49.4,45.89,49.04,45.75
+ BakLLava (Mistral-7B),66.46,61.48,54.83,51.33,63.76,56.52,78.16,61.02
+ SPHINX,74.45,65.5,62.13,59.11,73.26,66.09,77.56,67.69
+ mPLUG-Owl2 (LLaMA-7B),72.26,55.53,58.64,52.59,71.36,58.9,73.0,62.68
+ LLaVA-v1.5 (Vicuna-v1.5-7B),64.6,59.22,55.76,47.98,67.3,58.9,73.76,60.07
+ LLaVA-v1.5 (Vicuna-v1.5-13B),64.96,64.86,54.12,53.55,66.59,58.9,71.48,61.4
+ InternLM-XComposer-VL (InternLM),68.43,62.04,61.93,56.81,70.41,57.53,77.19,64.35
+ IDEFICS-Instruct (LLaMA-7B),60.04,46.42,46.71,40.38,59.9,47.26,64.77,51.51
+ Qwen-VL (QwenLM),65.33,60.74,58.44,54.13,66.35,58.22,73.0,61.67
+ Shikra (Vicuna-7B),69.09,47.93,46.71,47.31,60.86,53.08,64.77,55.32
+ Otter-v1 (MPT-7B),57.66,39.7,42.59,42.12,48.93,47.6,54.17,47.22
+ InstructBLIP (Flan-T5-XL),69.53,59.0,56.17,57.31,65.63,56.51,71.21,61.94
+ InstructBLIP (Vicuna-7B),70.99,51.41,43.0,45.0,63.01,57.19,64.39,55.85
+ VisualGLM-6B (GLM-6B),61.31,53.58,44.03,48.56,54.89,55.48,57.79,53.31
+ mPLUG-Owl (LLaMA-7B),72.45,54.88,47.53,49.62,63.01,62.67,66.67,58.93
+ LLaMA-Adapter-V2,66.61,54.66,51.65,56.15,61.81,59.25,54.55,58.06
+ LLaVA-v1 (Vicuna-13B),57.12,54.88,51.85,45.58,58.0,57.19,64.77,54.72
+ MiniGPT-4 (Vicuna-13B),60.77,50.33,43.0,45.58,52.51,53.42,60.98,51.77
+ Qwen-VL-Plus (Close-Source),75.74,73.25,57.33,64.88,73.24,68.67,70.56,68.93
+ Qwen-VL-Max (Close-Source),73.2,81.02,68.39,70.84,74.57,73.11,80.44,73.9
+ Gemini-Pro (Close-Source),71.26,71.39,65.59,67.3,73.04,65.88,73.6,69.46
+ GPT-4V (Close-Source),77.72,78.39,66.45,71.01,71.07,79.36,78.91,74.1