From 81605245c1dba897856fa45276d1c2e8d2888326 Mon Sep 17 00:00:00 2001
From: Zicheng Zhang <58689334+zzc-1998@users.noreply.github.com>
Date: Mon, 30 Oct 2023 15:27:25 +0800
Subject: [PATCH] Update README.md

---
 leaderboards/README.md | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/leaderboards/README.md b/leaderboards/README.md
index daba7f2..ffb3d37 100644
--- a/leaderboards/README.md
+++ b/leaderboards/README.md
@@ -31,7 +31,7 @@ _version_: v1.0.1.1015wip; _Timeliness_: Updated on 30th Oct.
 For the partition of the `dev` and `test` subsets, please see [our dataset release notes](../data_release/). As some models excel under the original testing pipeline while others perform better under PPL-based testing, we maintain two leaderboards, one for each testing method. See [examples](../example_code_for_idefics) for their different settings.
 
 ### Original Testing Pipeline
-- 13 models tested
+- 14 models tested
 - via Multi-Choice Questions
 
 #### Accuracies on Open-set (`dev`)
@@ -137,20 +137,20 @@ Abbreviations for dimensions: *comp: completeness, prec: precision, rele: releva
 
 | **Model Name** | p_{0, comp} | p_{1, comp} | p_{2, comp} | s_{comp} | p_{0, prec} | p_{1, prec} | p_{2, prec} | s_{prec} | p_{0, rele} | p_{1, rele} | p_{2, rele} | s_{rele} | s_{sum} |
 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
-| idefics | 28.91% | 59.16% | 11.93% | 0.83/2.00 | 34.68% | 27.86% | 37.46% | 1.03/2.00 | 3.90% | 59.66% | 36.44% | 1.33/2.00 | 3.18/6.00 |
-| instructblip_t5 | 23.16% | 66.44% | 10.40% | 0.87/2.00 | 34.85% | 26.03% | 39.12% | 1.04/2.00 | 14.71% | 59.87% | 25.42% | 1.11/2.00 | 3.02/6.00 |
-| instructblip_vicuna | 29.73% | 61.47% | 8.80% | 0.79/2.00 | 27.84% | 23.52% | 48.65% | 1.21/2.00 | 27.40% | 61.29% | 11.31% | 0.84/2.00 | 2.84/6.00 |
-| internlm_xcomposer_vl | 19.19% | 70.39% | 10.42% | 0.91/2.00 | 28.23% | 32.38% | 39.39% | 1.11/2.00 | 8.87% | 65.38% | 25.76% | 1.17/2.00 | 3.19/6.00 |
-| kosmos_2 | 8.76% | 70.91% | 20.33% | **1.12/2.00** (rank 1) | 29.45% | 34.75% | 35.81% | 1.06/2.00 | 0.16% | 14.77% | 85.06% | **1.85/2.00** (rank 1) | **4.03/6.00** (rank 1) |
-| llama_adapter_v2 | 30.44% | 53.99% | 15.57% | 0.85/2.00 | 29.41% | 25.79% | 44.80% | 1.15/2.00 | 1.50% | 52.75% | 45.75% | 1.44/2.00 | 3.45/6.00 |
-| llava_v1.5 | 27.68% | 53.78% | 18.55% | 0.91/2.00 | 25.45% | 21.47% | 53.08% | **1.28/2.00** (rank 1) | 6.31% | 58.75% | 34.94% | 1.29/2.00 | 3.47/6.00 |
-| llava_v1 | 34.10% | 40.52% | 25.39% | 0.91/2.00 | 30.02% | 15.15% | 54.83% | 1.25/2.00 | 1.06% | 38.03% | 60.91% | 1.60/2.00 | 3.76/6.00 |
-| minigpt4_13b | 34.01% | 32.15% | 33.85% | 1.00/2.00 | 29.20% | 15.27% | 55.53% | 1.26/2.00 | 6.88% | 45.65% | 47.48% | 1.41/2.00 | 3.67/6.00 |
-| mplug_owl | 28.28% | 37.69% | 34.03% | 1.06/2.00 | 26.75% | 18.18% | 55.07% | **1.28/2.00** (rank 1) | 3.03% | 33.82% | 63.15% | 1.60/2.00 | 3.94/6.00 |
-| otter_v1 | 22.38% | 59.36% | 18.25% | 0.96/2.00 | 40.68% | 35.99% | 23.33% | 0.83/2.00 | 1.95% | 13.20% | 84.85% | 1.83/2.00 | 3.61/6.00 |
-| qwen_vl | 26.34% | 49.13% | 24.53% | 0.98/2.00 | 50.62% | 23.44% | 25.94% | 0.75/2.00 | 0.73% | 35.56% | 63.72% | 1.63/2.00 | 3.36/6.00 |
-| shikra | 21.14% | 68.33% | 10.52% | 0.89/2.00 | 30.33% | 28.30% | 41.37% | 1.11/2.00 | 1.14% | 64.36% | 34.50% | 1.33/2.00 | 3.34/6.00 |
-| visualglm | 30.75% | 56.64% | 12.61% | 0.82/2.00 | 38.64% | 26.18% | 35.18% | 0.97/2.00 | 6.14% | 67.15% | 26.71% | 1.21/2.00 | 2.99/6.00 |
+| LLaVA-v1.5 (Vicuna-v1.5-13B) | 27.68% | 53.78% | 18.55% | 0.91/2.00 | 25.45% | 21.47% | 53.08% | 1.28/2.00 | 6.31% | 58.75% | 34.94% | 1.29/2.00 | 3.47/6.00 |
+| InternLM-XComposer (InternLM) | 19.94% | 51.82% | 28.24% | 1.08/2.00 | 22.59% | 28.99% | 48.42% | 1.26/2.00 | 1.05% | 10.62% | 88.32% | 1.87/2.00 | 4.21/6.00 |
+| IDEFICS-Instruct (LLaMA-7B) | 28.91% | 59.16% | 11.93% | 0.83/2.00 | 34.68% | 27.86% | 37.46% | 1.03/2.00 | 3.90% | 59.66% | 36.44% | 1.33/2.00 | 3.18/6.00 |
+| Qwen-VL (QwenLM) | 26.34% | 49.13% | 24.53% | 0.98/2.00 | 50.62% | 23.44% | 25.94% | 0.75/2.00 | 0.73% | 35.56% | 63.72% | 1.63/2.00 | 3.36/6.00 |
+| Shikra (Vicuna-7B) | 21.14% | 68.33% | 10.52% | 0.89/2.00 | 30.33% | 28.30% | 41.37% | 1.11/2.00 | 1.14% | 64.36% | 34.50% | 1.33/2.00 | 3.34/6.00 |
+| Otter-v1 (MPT-7B) | 22.38% | 59.36% | 18.25% | 0.96/2.00 | 40.68% | 35.99% | 23.33% | 0.83/2.00 | 1.95% | 13.20% | 84.85% | 1.83/2.00 | 3.61/6.00 |
+| Kosmos-2 | 8.76% | 70.91% | 20.33% | 1.12/2.00 | 29.45% | 34.75% | 35.81% | 1.06/2.00 | 0.16% | 14.77% | 85.06% | 1.85/2.00 | 4.03/6.00 |
+| InstructBLIP (Flan-T5-XL) | 23.16% | 66.44% | 10.40% | 0.87/2.00 | 34.85% | 26.03% | 39.12% | 1.04/2.00 | 14.71% | 59.87% | 25.42% | 1.11/2.00 | 3.02/6.00 |
+| InstructBLIP (Vicuna-7B) | 29.73% | 61.47% | 8.80% | 0.79/2.00 | 27.84% | 23.52% | 48.65% | 1.21/2.00 | 27.40% | 61.29% | 11.31% | 0.84/2.00 | 2.84/6.00 |
+| VisualGLM-6B (GLM-6B) | 30.75% | 56.64% | 12.61% | 0.82/2.00 | 38.64% | 26.18% | 35.18% | 0.97/2.00 | 6.14% | 67.15% | 26.71% | 1.21/2.00 | 2.99/6.00 |
+| mPLUG-Owl (LLaMA-7B) | 28.28% | 37.69% | 34.03% | 1.06/2.00 | 26.75% | 18.18% | 55.07% | 1.28/2.00 | 3.03% | 33.82% | 63.15% | 1.60/2.00 | 3.94/6.00 |
+| LLaMA-Adapter-V2 | 30.44% | 53.99% | 15.57% | 0.85/2.00 | 29.41% | 25.79% | 44.80% | 1.15/2.00 | 1.50% | 52.75% | 45.75% | 1.44/2.00 | 3.45/6.00 |
+| LLaVA-v1 (Vicuna-13B) | 34.10% | 40.52% | 25.39% | 0.91/2.00 | 30.02% | 15.15% | 54.83% | 1.25/2.00 | 1.06% | 38.03% | 60.91% | 1.60/2.00 | 3.76/6.00 |
+| MiniGPT-4 (Vicuna-13B) | 34.01% | 32.15% | 33.85% | 1.00/2.00 | 29.20% | 15.27% | 55.53% | 1.26/2.00 | 6.88% | 45.65% | 47.48% | 1.41/2.00 | 3.67/6.00 |
 
 ## Leaderboards for (A3): Assessment
 