teowu committed on
Commit
5e9cb18
•
1 Parent(s): 2cf064e

initial A1 results

Files changed (4)
  1. README.md +1 -1
  2. app.py +38 -0
  3. qbench_a1_single_dev.csv +25 -0
  4. qbench_a1_single_test.csv +25 -0
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Q Bench Leaderboard
- emoji: 🏃
+ emoji: 📊
  colorFrom: blue
  colorTo: yellow
  sdk: gradio
app.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ import pandas as pd
+ block = gr.Blocks(title="Q-Bench Leaderboard")
+
+ LEADERBOARD_INTRODUCTION = """# Q-Bench Leaderboard
+
+
+ <img style="width:40%" src="https://raw.githubusercontent.com/Q-Future/Q-Bench/master/logo.png">
+
+
+ *"How do multi-modality LLMs perform on low-level computer vision?"*
+ 🏆 Welcome to the leaderboard of **Q-Bench**! *A Comprehensive Benchmark Suite for General-purpose Foundation Models on Low-level Vision*
+ <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
+ <a href="https://github.com/Q-Future/"><img src="https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fvqassessment%2FQ-Bench&count_bg=%23E97EBA&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false"/></a>
+ <a href="https://github.com/Q-Future/Q-Bench"><img src="https://img.shields.io/github/stars/Q-Future/Q-Bench"/></a>
+ <a href="https://arxiv.org/abs/2309.14181"><img src="https://img.shields.io/badge/Arxiv-2309:14181-red"/></a>
+ <a href="https://github.com/Q-Future/Q-Bench/releases/tag/v1.0.1.1014datarelease"><img src="https://img.shields.io/badge/Data-Release-green"></a>
+ <a href="https://github.com/Q-Future/Q-Instruct"><img src="https://img.shields.io/badge/Awesome-QInstruct-orange"/></a>
+ </div>
+
+ - **Low-level Visual Perception (A1):** Open-range multiple-choice questions on low-level visual perception. Dataset: [LLVisionQA](https://huggingface.co/datasets/teowu/LLVisionQA-QBench)
+ - **Low-level Visual Description (A2):** Detailed description of low-level visual attributes. Dataset: [LLDescribe](https://huggingface.co/datasets/teowu/LLDescribe-QBench)
+ - **Visual Quality Assessment (A3):** MLLMs can give a *precise visual quality score* via *logprobs*!
+
+ For now, we only include results validated in our paper; user submissions will be supported soon.
+ """
+
+
+ with block:
+     gr.Markdown(
+         LEADERBOARD_INTRODUCTION
+     )
+     with gr.Tab("Perception-A1-dev"):
+         gr.DataFrame(pd.read_csv("qbench_a1_single_dev.csv"))
+     with gr.Tab("Perception-A1-test"):
+         gr.DataFrame(pd.read_csv("qbench_a1_single_test.csv"))
+
+ block.launch(share=True)
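Note on the A3 track referenced in the introduction string above: the "precise visual quality score via logprobs" idea amounts to a two-way softmax over the log-probabilities an MLLM assigns to quality anchor tokens. The snippet below is a minimal, self-contained sketch of that idea, not the exact Q-Bench implementation; the helper name, the "good"/"poor" anchor tokens, and the example values are assumptions for illustration.

import math

def quality_score_from_logprobs(logprob_good: float, logprob_poor: float) -> float:
    # Hypothetical helper: turn the model's log-probabilities for the anchor
    # tokens "good" and "poor" (e.g. the next token after a prompt such as
    # "The quality of this image is") into a score in [0, 1] via a
    # two-way softmax.
    p_good = math.exp(logprob_good)
    p_poor = math.exp(logprob_poor)
    return p_good / (p_good + p_poor)

# Example with made-up logprobs where the model slightly favours "good":
print(quality_score_from_logprobs(-0.7, -1.4))  # ~0.67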
qbench_a1_single_dev.csv ADDED
@@ -0,0 +1,25 @@
+ Model (variant),Yes-or-No,What,How,Distortion,Other,In-context Distortion,In-context Other,Overall
+ InfiMM (Zephyr-7B),57.45,57.96,44.62,47.27,57.17,49.67,64.08,53.37
+ Emu2-Chat (LLaMA-33B),71.81,67.25,56.18,64.78,63.19,63.48,72.24,65.28
+ Fuyu-8B (Persimmon-8B),53.33,43.7,38.0,40.81,47.4,45.45,49.23,45.05
+ BakLLava (Mistral-7B),66.0,56.16,51.12,51.15,61.57,53.72,72.0,57.48
+ SPHINX,74.18,68.81,62.07,63.62,71.76,66.12,76.33,68.56
+ mPLUG-Owl2 (LLaMA-7B),72.18,57.96,56.19,56.68,69.21,53.29,72.65,61.61
+ LLaVA-v1.5 (Vicuna-v1.5-7B),66.36,58.19,50.51,49.42,65.74,54.61,70.61,58.66
+ LLaVA-v1.5 (Vicuna-v1.5-13B),65.27,64.38,56.59,56.03,67.13,61.18,67.35,62.14
+ InternLM-XComposer-VL (InternLM),69.45,65.27,60.85,61.67,70.14,56.91,75.1,65.35
+ IDEFICS-Instruct (LLaMA-7B),56.18,44.69,44.02,42.8,54.17,44.74,56.33,48.7
+ Qwen-VL (QwenLM),63.09,58.19,56.39,50.58,62.73,57.89,73.88,59.4
+ Shikra (Vicuna-7B),65.64,47.35,49.09,48.83,59.49,50.0,64.08,54.65
+ Otter-v1 (MPT-7B),57.09,40.71,39.55,42.22,49.31,44.08,52.65,46.35
+ InstructBLIP (Flan-T5-XL),67.64,59.96,55.98,56.23,65.51,58.22,69.39,61.47
+ InstructBLIP (Vicuna-7B),71.64,52.65,43.81,48.64,62.5,55.59,64.9,56.72
+ VisualGLM-6B (GLM-6B),60.18,54.2,46.25,51.75,54.4,53.62,57.14,53.78
+ mPLUG-Owl (LLaMA-7B),66.0,54.87,44.02,51.36,55.09,54.28,65.71,55.38
+ LLaMA-Adapter-V2,66.18,59.29,52.13,57.39,56.25,63.16,64.9,59.46
+ LLaVA-v1 (Vicuna-13B),54.0,53.1,55.38,48.64,54.63,55.59,63.27,54.18
+ MiniGPT-4 (Vicuna-13B),55.82,50.22,40.37,42.02,48.38,51.97,61.22,49.03
+ Qwen-VL-Plus (Close-Source),73.77,69.47,53.88,66.21,65.72,63.81,68.75,66.04
+ Qwen-VL-Max (Close-Source),75.6,79.43,66.09,73.39,74.08,71.0,76.92,73.63
+ Gemini-Pro (Close-Source),68.8,73.74,62.34,66.3,71.34,63.91,73.09,68.16
+ GPT-4V (Close-Source),76.85,79.17,67.52,73.53,76.18,72.83,76.47,74.51
qbench_a1_single_test.csv ADDED
@@ -0,0 +1,25 @@
+ Model (variant),Yes-or-No,What,How,Distortion,Other,In-context Distortion,In-context Other,Overall
+ InfiMM (Zephyr-7B),61.31,56.61,49.58,47.79,62.05,51.71,67.68,56.05
+ Emu2-Chat (LLaMA-33B),70.09,65.12,54.11,66.22,62.96,63.47,73.21,64.32
+ Fuyu-8B (Persimmon-8B),62.22,35.79,36.62,41.07,49.4,45.89,49.04,45.75
+ BakLLava (Mistral-7B),66.46,61.48,54.83,51.33,63.76,56.52,78.16,61.02
+ SPHINX,74.45,65.5,62.13,59.11,73.26,66.09,77.56,67.69
+ mPLUG-Owl2 (LLaMA-7B),72.26,55.53,58.64,52.59,71.36,58.9,73.0,62.68
+ LLaVA-v1.5 (Vicuna-v1.5-7B),64.6,59.22,55.76,47.98,67.3,58.9,73.76,60.07
+ LLaVA-v1.5 (Vicuna-v1.5-13B),64.96,64.86,54.12,53.55,66.59,58.9,71.48,61.4
+ InternLM-XComposer-VL (InternLM),68.43,62.04,61.93,56.81,70.41,57.53,77.19,64.35
+ IDEFICS-Instruct (LLaMA-7B),60.04,46.42,46.71,40.38,59.9,47.26,64.77,51.51
+ Qwen-VL (QwenLM),65.33,60.74,58.44,54.13,66.35,58.22,73.0,61.67
+ Shikra (Vicuna-7B),69.09,47.93,46.71,47.31,60.86,53.08,64.77,55.32
+ Otter-v1 (MPT-7B),57.66,39.7,42.59,42.12,48.93,47.6,54.17,47.22
+ InstructBLIP (Flan-T5-XL),69.53,59.0,56.17,57.31,65.63,56.51,71.21,61.94
+ InstructBLIP (Vicuna-7B),70.99,51.41,43.0,45.0,63.01,57.19,64.39,55.85
+ VisualGLM-6B (GLM-6B),61.31,53.58,44.03,48.56,54.89,55.48,57.79,53.31
+ mPLUG-Owl (LLaMA-7B),72.45,54.88,47.53,49.62,63.01,62.67,66.67,58.93
+ LLaMA-Adapter-V2,66.61,54.66,51.65,56.15,61.81,59.25,54.55,58.06
+ LLaVA-v1 (Vicuna-13B),57.12,54.88,51.85,45.58,58.0,57.19,64.77,54.72
+ MiniGPT-4 (Vicuna-13B),60.77,50.33,43.0,45.58,52.51,53.42,60.98,51.77
+ Qwen-VL-Plus (Close-Source),75.74,73.25,57.33,64.88,73.24,68.67,70.56,68.93
+ Qwen-VL-Max (Close-Source),73.2,81.02,68.39,70.84,74.57,73.11,80.44,73.9
+ Gemini-Pro (Close-Source),71.26,71.39,65.59,67.3,73.04,65.88,73.6,69.46
+ GPT-4V (Close-Source),77.72,78.39,66.45,71.01,71.07,79.36,78.91,74.1