From 7e0c0915b352fa0ab4056c920e48ec7e70596789 Mon Sep 17 00:00:00 2001
From: acc12649wd <acc12649wd@es2.abci.local>
Date: Sun, 27 Aug 2023 09:56:56 +0900
Subject: [PATCH 1/2] add evaluation results for weblab-10b models

---
 .../weblab-10b-instruction-sft/harness.sh     | 14 ++++
 .../weblab-10b-instruction-sft/result.json    | 71 +++++++++++++++++++
 models/matsuo-lab/weblab-10b/harness.sh       | 14 ++++
 models/matsuo-lab/weblab-10b/result.json      | 71 +++++++++++++++++++
 4 files changed, 170 insertions(+)
 create mode 100644 models/matsuo-lab/weblab-10b-instruction-sft/harness.sh
 create mode 100644 models/matsuo-lab/weblab-10b-instruction-sft/result.json
 create mode 100644 models/matsuo-lab/weblab-10b/harness.sh
 create mode 100644 models/matsuo-lab/weblab-10b/result.json

diff --git a/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh b/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh
new file mode 100644
index 0000000000..9e545d5488
--- /dev/null
+++ b/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Runs main.py (hf-causal) on matsuo-lab/weblab-10b-instruction-sft; paths are relative, so start it from the directory containing main.py.
+MODEL_NAME="weblab-10b-instruction-sft"
+MODEL_ARGS="pretrained=matsuo-lab/${MODEL_NAME},torch_dtype=auto"
+TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.2-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+NUM_FEWSHOT="3,3,3,2,1,1,0,5" # 8 values for the 8 tasks in TASK; presumably paired by position -- confirm against main.py
+OUTPUT_PATH="models/matsuo-lab/${MODEL_NAME}/result.json"
+python main.py \
+    --model hf-causal \
+    --model_args "$MODEL_ARGS" \
+    --tasks "$TASK" \
+    --num_fewshot "$NUM_FEWSHOT" \
+    --device "cuda" \
+    --output_path "$OUTPUT_PATH" # expansions are quoted so each value reaches main.py as exactly one argument
diff --git a/models/matsuo-lab/weblab-10b-instruction-sft/result.json b/models/matsuo-lab/weblab-10b-instruction-sft/result.json
new file mode 100644
index 0000000000..a5cb71a93d
--- /dev/null
+++ b/models/matsuo-lab/weblab-10b-instruction-sft/result.json
@@ -0,0 +1,71 @@
+{
+  "results": {
+    "jcommonsenseqa-1.1-0.3": {
+      "acc": 0.7462019660411081,
+      "acc_stderr": 0.013015217181453645,
+      "acc_norm": 0.7444146559428061,
+      "acc_norm_stderr": 0.013045313758426097
+    },
+    "jnli-1.1-0.3": {
+      "acc": 0.6655710764174199,
+      "acc_stderr": 0.009564848188061246,
+      "acc_norm": 0.5349219391947412,
+      "acc_norm_stderr": 0.010112000378231595
+    },
+    "marc_ja-1.1-0.3": {
+      "acc": 0.9548991864166961,
+      "acc_stderr": 0.0027601421998296426,
+      "acc_norm": 0.9548991864166961,
+      "acc_norm_stderr": 0.0027601421998296426
+    },
+    "xwinograd_ja": {
+      "acc": 0.7194994786235662,
+      "acc_stderr": 0.014514407890552954
+    },
+    "jsquad-1.1-0.3": {
+      "exact_match": 78.34308869878433,
+      "f1": 84.6243755976619
+    },
+    "jaqket_v2-0.2-0.3": {
+      "exact_match": 63.31615120274914,
+      "f1": 66.73484972454041
+    },
+    "xlsum_ja-1.0-0.3": {
+      "rouge2": 20.569208532135303
+    },
+    "mgsm-1.0-0.3": {
+      "acc": 0.02,
+      "acc_stderr": 0.008872139507342685
+    }
+  },
+  "versions": {
+    "jcommonsenseqa-1.1-0.3": 1.1,
+    "jnli-1.1-0.3": 1.1,
+    "marc_ja-1.1-0.3": 1.1,
+    "jsquad-1.1-0.3": 1.1,
+    "jaqket_v2-0.2-0.3": 0.2,
+    "xlsum_ja-1.0-0.3": 1.0,
+    "xwinograd_ja": 1.0,
+    "mgsm-1.0-0.3": 1.0
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=matsuo-lab/weblab-10b-instruction-sft,torch_dtype=auto",
+    "num_fewshot": [
+      3,
+      3,
+      3,
+      2,
+      1,
+      1,
+      0,
+      5
+    ],
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/models/matsuo-lab/weblab-10b/harness.sh b/models/matsuo-lab/weblab-10b/harness.sh
new file mode 100644
index 0000000000..0266e900b0
--- /dev/null
+++ b/models/matsuo-lab/weblab-10b/harness.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Runs main.py (hf-causal) on matsuo-lab/weblab-10b; paths are relative, so start it from the directory containing main.py.
+MODEL_NAME="weblab-10b"
+MODEL_ARGS="pretrained=matsuo-lab/${MODEL_NAME},torch_dtype=auto"
+TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.2-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+NUM_FEWSHOT="3,3,3,2,1,1,0,5" # 8 values for the 8 tasks in TASK; presumably paired by position -- confirm against main.py
+OUTPUT_PATH="models/matsuo-lab/${MODEL_NAME}/result.json"
+python main.py \
+    --model hf-causal \
+    --model_args "$MODEL_ARGS" \
+    --tasks "$TASK" \
+    --num_fewshot "$NUM_FEWSHOT" \
+    --device "cuda" \
+    --output_path "$OUTPUT_PATH" # expansions are quoted so each value reaches main.py as exactly one argument
diff --git a/models/matsuo-lab/weblab-10b/result.json b/models/matsuo-lab/weblab-10b/result.json
new file mode 100644
index 0000000000..ecb10f2c2d
--- /dev/null
+++ b/models/matsuo-lab/weblab-10b/result.json
@@ -0,0 +1,71 @@
+{
+  "results": {
+    "jcommonsenseqa-1.1-0.3": {
+      "acc": 0.6657730116175157,
+      "acc_stderr": 0.014107918728785251,
+      "acc_norm": 0.6157283288650581,
+      "acc_norm_stderr": 0.0145476502199282
+    },
+    "jnli-1.1-0.3": {
+      "acc": 0.5373870172555464,
+      "acc_stderr": 0.010108377185797195,
+      "acc_norm": 0.49671322925225964,
+      "acc_norm_stderr": 0.0101365359837632
+    },
+    "marc_ja-1.1-0.3": {
+      "acc": 0.8206579412805094,
+      "acc_stderr": 0.005102495976041424,
+      "acc_norm": 0.8206579412805094,
+      "acc_norm_stderr": 0.005102495976041424
+    },
+    "xwinograd_ja": {
+      "acc": 0.7194994786235662,
+      "acc_stderr": 0.014514407890552965
+    },
+    "jsquad-1.1-0.3": {
+      "exact_match": 62.94461954074741,
+      "f1": 71.39321494073558
+    },
+    "jaqket_v2-0.2-0.3": {
+      "exact_match": 56.18556701030928,
+      "f1": 60.1091610756559
+    },
+    "xlsum_ja-1.0-0.3": {
+      "rouge2": 10.025750779559178
+    },
+    "mgsm-1.0-0.3": {
+      "acc": 0.024,
+      "acc_stderr": 0.009699087026964242
+    }
+  },
+  "versions": {
+    "jcommonsenseqa-1.1-0.3": 1.1,
+    "jnli-1.1-0.3": 1.1,
+    "marc_ja-1.1-0.3": 1.1,
+    "jsquad-1.1-0.3": 1.1,
+    "jaqket_v2-0.2-0.3": 0.2,
+    "xlsum_ja-1.0-0.3": 1.0,
+    "xwinograd_ja": 1.0,
+    "mgsm-1.0-0.3": 1.0
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=matsuo-lab/weblab-10b,torch_dtype=auto",
+    "num_fewshot": [
+      3,
+      3,
+      3,
+      2,
+      1,
+      1,
+      0,
+      5
+    ],
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file

From 20a7315250ed5db0d9291abaa9d67f2a93907cc4 Mon Sep 17 00:00:00 2001
From: acc12649wd <acc12649wd@es2.abci.local>
Date: Sun, 27 Aug 2023 10:11:53 +0900
Subject: [PATCH 2/2] add weblab-10b models to the README leaderboard

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index c6b6e7d206..4e4a3b71a7 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,10 @@
 ## Leaderboard
 | model                                                                                                                                                                                                                                                 |   average |   jcommonsenseqa |   jnli |   marc_ja |   jsquad |   jaqket_v2 |   xlsum_ja |   xwinograd_ja |   mgsm | eval script                                                                                                                                                                                                                                                                                                                                                     |
 |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------:|-----------------:|-------:|----------:|---------:|------------:|-----------:|---------------:|-------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| <a target="_blank" href="https://huggingface.co/matsuo-lab/weblab-10b-instruction-sft" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">matsuo-lab-weblab-10b-instruction-sft</a>                     |     59.11 |            74.62 |  66.56 |     95.49 |    78.34 |       63.32 |      20.57 |          71.95 |    2   | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/matsuo-lab/weblab-10b-instruction-sft/harness.sh</a> |
 | <a target="_blank" href="https://huggingface.co/stabilityai/japanese-stablelm-instruct-alpha-7b" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">stabilityai-japanese-stablelm-instruct-alpha-7b</a> |     54.71 |            82.22 |  52.05 |     82.88 |    63.26 |       74.83 |       7.79 |          72.68 |    2   | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/stabilityai/stabilityai-japanese-stablelm-instruct-alpha-7b/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/stabilityai/stabilityai-japanese-stablelm-instruct-alpha-7b/harness.sh</a> |
 | <a target="_blank" href="https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">stabilityai-japanese-stablelm-base-alpha-7b</a>         |     51.06 |            33.42 |  43.34 |     96.73 |    70.62 |       78.09 |      10.65 |          72.78 |    2.8 | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/stabilityai/stabilityai-japanese-stablelm-base-alpha-7b/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/stabilityai/stabilityai-japanese-stablelm-base-alpha-7b/harness.sh</a>         |
+| <a target="_blank" href="https://huggingface.co/matsuo-lab/weblab-10b" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">matsuo-lab-weblab-10b</a>                                                     |     50.74 |            66.58 |  53.74 |     82.07 |    62.94 |       56.19 |      10.03 |          71.95 |    2.4 | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/matsuo-lab/weblab-10b/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/matsuo-lab/weblab-10b/harness.sh</a> |
 | <a target="_blank" href="https://huggingface.co/rinna/bilingual-gpt-neox-4b-instruction-sft" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">rinna-bilingual-gpt-neox-4b-instruction-sft</a>         |     47.75 |            49.51 |  47.08 |     95.28 |    55.99 |       61.17 |       5.51 |          64.65 |    2.8 | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/harness.sh</a>                     |
 | <a target="_blank" href="https://huggingface.co/rinna/bilingual-gpt-neox-4b-instruction-ppo" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">rinna-bilingual-gpt-neox-4b-instruction-ppo</a>         |     47.18 |            48.79 |  48.23 |     96.09 |    54.16 |       57.65 |       5.03 |          65.07 |    2.4 | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/harness.sh</a>                     |
 | <a target="_blank" href="https://huggingface.co/llama2/13b-chat" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">llama2-13b-chat</a>                                                                 |     47.02 |            72.56 |  35.62 |     59.92 |    67.69 |       48.2  |      15.14 |          63.82 |   13.2 | <a target="_blank" href="https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/models/llama2/llama2-13b-chat/harness.sh" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">models/llama2/llama2-13b-chat/harness.sh</a>                                                                           |