From 81605245c1dba897856fa45276d1c2e8d2888326 Mon Sep 17 00:00:00 2001
From: Zicheng Zhang <58689334+zzc-1998@users.noreply.github.com>
Date: Mon, 30 Oct 2023 15:27:25 +0800
Subject: [PATCH] Update README.md

---
 leaderboards/README.md | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/leaderboards/README.md b/leaderboards/README.md
index daba7f2..ffb3d37 100644
--- a/leaderboards/README.md
+++ b/leaderboards/README.md
@@ -31,7 +31,7 @@ _version_: v1.0.1.1015wip; _Timeliness_: Updated on 30th Oct.
 For the partition of the `dev` and `test` subsets, please see [our dataset release notes](../data_release/). As some models excel under the original testing pipeline while others perform better under PPL-based testing, we maintain two leaderboards, one for each testing method. See [examples](../example_code_for_idefics) for their different settings.
 
 ### Original Testing Pipeline
-- 13 models tested
+- 14 models tested
 - via Multi-Choice Questions
 
 #### Accuracies on Open-set (`dev`)
@@ -137,20 +137,20 @@ Abbreviations for dimensions: *comp: completeness, prec: precision, rele: releva
 
 | **Model Name** | p_{0, comp} | p_{1, comp} | p_{2, comp} | s_{comp} | p_{0, prec} | p_{1, prec} | p_{2, prec} | s_{prec} | p_{0, rele} | p_{1, rele} | p_{2, rele} | s_{rele} | s_{sum} |
 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
-| idefics | 28.91% | 59.16% | 11.93% | 0.83/2.00 | 34.68% | 27.86% | 37.46% | 1.03/2.00 | 3.90% | 59.66% | 36.44% | 1.33/2.00 | 3.18/6.00 |
-| instructblip_t5 | 23.16% | 66.44% | 10.40% | 0.87/2.00 | 34.85% | 26.03% | 39.12% | 1.04/2.00 | 14.71% | 59.87% | 25.42% | 1.11/2.00 | 3.02/6.00 |
-| instructblip_vicuna | 29.73% | 61.47% | 8.80% | 0.79/2.00 | 27.84% | 23.52% | 48.65% | 1.21/2.00 | 27.40% | 61.29% | 11.31% | 0.84/2.00 | 2.84/6.00 |
-| internlm_xcomposer_vl | 19.19% | 70.39% | 10.42% | 0.91/2.00 | 28.23% | 32.38% | 39.39% | 1.11/2.00 | 8.87% | 65.38% | 25.76% | 1.17/2.00 | 3.19/6.00 |
-| kosmos_2 | 8.76% | 70.91% | 20.33% | **1.12/2.00** (rank 1) | 29.45% | 34.75% | 35.81% | 1.06/2.00 | 0.16% | 14.77% | 85.06% | **1.85/2.00** (rank 1) | **4.03/6.00** (rank 1) |
-| llama_adapter_v2 | 30.44% | 53.99% | 15.57% | 0.85/2.00 | 29.41% | 25.79% | 44.80% | 1.15/2.00 | 1.50% | 52.75% | 45.75% | 1.44/2.00 | 3.45/6.00 |
-| llava_v1.5 | 27.68% | 53.78% | 18.55% | 0.91/2.00 | 25.45% | 21.47% | 53.08% | **1.28/2.00** (rank 1) | 6.31% | 58.75% | 34.94% | 1.29/2.00 | 3.47/6.00 |
-| llava_v1 | 34.10% | 40.52% | 25.39% | 0.91/2.00 | 30.02% | 15.15% | 54.83% | 1.25/2.00 | 1.06% | 38.03% | 60.91% | 1.60/2.00 | 3.76/6.00 |
-| minigpt4_13b | 34.01% | 32.15% | 33.85% | 1.00/2.00 | 29.20% | 15.27% | 55.53% | 1.26/2.00 | 6.88% | 45.65% | 47.48% | 1.41/2.00 | 3.67/6.00 |
-| mplug_owl | 28.28% | 37.69% | 34.03% | 1.06/2.00 | 26.75% | 18.18% | 55.07% | **1.28/2.00** (rank 1) | 3.03% | 33.82% | 63.15% | 1.60/2.00 | 3.94/6.00 |
-| otter_v1 | 22.38% | 59.36% | 18.25% | 0.96/2.00 | 40.68% | 35.99% | 23.33% | 0.83/2.00 | 1.95% | 13.20% | 84.85% | 1.83/2.00 | 3.61/6.00 |
-| qwen_vl | 26.34% | 49.13% | 24.53% | 0.98/2.00 | 50.62% | 23.44% | 25.94% | 0.75/2.00 | 0.73% | 35.56% | 63.72% | 1.63/2.00 | 3.36/6.00 |
-| shikra | 21.14% | 68.33% | 10.52% | 0.89/2.00 | 30.33% | 28.30% | 41.37% | 1.11/2.00 | 1.14% | 64.36% | 34.50% | 1.33/2.00 | 3.34/6.00 |
-| visualglm | 30.75% | 56.64% | 12.61% | 0.82/2.00 | 38.64% | 26.18% | 35.18% | 0.97/2.00 | 6.14% | 67.15% | 26.71% | 1.21/2.00 | 2.99/6.00 |
+| LLaVA-v1.5 (Vicuna-v1.5-13B) | 27.68% | 53.78% | 18.55% | 0.91/2.00 | 25.45% | 21.47% | 53.08% | 1.28/2.00 | 6.31% | 58.75% | 34.94% | 1.29/2.00 | 3.47/6.00 |
+| InternLM-XComposer (InternLM) | 19.94% | 51.82% | 28.24% | 1.08/2.00 | 22.59% | 28.99% | 48.42% | 1.26/2.00 | 1.05% | 10.62% | 88.32% | 1.87/2.00 | 4.21/6.00 |
+| IDEFICS-Instruct (LLaMA-7B) | 28.91% | 59.16% | 11.93% | 0.83/2.00 | 34.68% | 27.86% | 37.46% | 1.03/2.00 | 3.90% | 59.66% | 36.44% | 1.33/2.00 | 3.18/6.00 |
+| Qwen-VL (QwenLM) | 26.34% | 49.13% | 24.53% | 0.98/2.00 | 50.62% | 23.44% | 25.94% | 0.75/2.00 | 0.73% | 35.56% | 63.72% | 1.63/2.00 | 3.36/6.00 |
+| Shikra (Vicuna-7B) | 21.14% | 68.33% | 10.52% | 0.89/2.00 | 30.33% | 28.30% | 41.37% | 1.11/2.00 | 1.14% | 64.36% | 34.50% | 1.33/2.00 | 3.34/6.00 |
+| Otter-v1 (MPT-7B) | 22.38% | 59.36% | 18.25% | 0.96/2.00 | 40.68% | 35.99% | 23.33% | 0.83/2.00 | 1.95% | 13.20% | 84.85% | 1.83/2.00 | 3.61/6.00 |
+| Kosmos-2 | 8.76% | 70.91% | 20.33% | 1.12/2.00 | 29.45% | 34.75% | 35.81% | 1.06/2.00 | 0.16% | 14.77% | 85.06% | 1.85/2.00 | 4.03/6.00 |
+| InstructBLIP (Flan-T5-XL) | 23.16% | 66.44% | 10.40% | 0.87/2.00 | 34.85% | 26.03% | 39.12% | 1.04/2.00 | 14.71% | 59.87% | 25.42% | 1.11/2.00 | 3.02/6.00 |
+| InstructBLIP (Vicuna-7B) | 29.73% | 61.47% | 8.80% | 0.79/2.00 | 27.84% | 23.52% | 48.65% | 1.21/2.00 | 27.40% | 61.29% | 11.31% | 0.84/2.00 | 2.84/6.00 |
+| VisualGLM-6B (GLM-6B) | 30.75% | 56.64% | 12.61% | 0.82/2.00 | 38.64% | 26.18% | 35.18% | 0.97/2.00 | 6.14% | 67.15% | 26.71% | 1.21/2.00 | 2.99/6.00 |
+| mPLUG-Owl (LLaMA-7B) | 28.28% | 37.69% | 34.03% | 1.06/2.00 | 26.75% | 18.18% | 55.07% | 1.28/2.00 | 3.03% | 33.82% | 63.15% | 1.60/2.00 | 3.94/6.00 |
+| LLaMA-Adapter-V2 | 30.44% | 53.99% | 15.57% | 0.85/2.00 | 29.41% | 25.79% | 44.80% | 1.15/2.00 | 1.50% | 52.75% | 45.75% | 1.44/2.00 | 3.45/6.00 |
+| LLaVA-v1 (Vicuna-13B) | 34.10% | 40.52% | 25.39% | 0.91/2.00 | 30.02% | 15.15% | 54.83% | 1.25/2.00 | 1.06% | 38.03% | 60.91% | 1.60/2.00 | 3.76/6.00 |
+| MiniGPT-4 (Vicuna-13B) | 34.01% | 32.15% | 33.85% | 1.00/2.00 | 29.20% | 15.27% | 55.53% | 1.26/2.00 | 6.88% | 45.65% | 47.48% | 1.41/2.00 | 3.67/6.00 |
 
 ## Leaderboards for (A3): Assessment
 