From 7e0c0915b352fa0ab4056c920e48ec7e70596789 Mon Sep 17 00:00:00 2001 From: acc12649wd Date: Sun, 27 Aug 2023 09:56:56 +0900 Subject: [PATCH 1/2] add evaluation results for weblab-10b models --- .../weblab-10b-instruction-sft/harness.sh | 14 ++++ .../weblab-10b-instruction-sft/result.json | 71 +++++++++++++++++++ models/matsuo-lab/weblab-10b/harness.sh | 14 ++++ models/matsuo-lab/weblab-10b/result.json | 71 +++++++++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 models/matsuo-lab/weblab-10b-instruction-sft/harness.sh create mode 100644 models/matsuo-lab/weblab-10b-instruction-sft/result.json create mode 100644 models/matsuo-lab/weblab-10b/harness.sh create mode 100644 models/matsuo-lab/weblab-10b/result.json diff --git a/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh b/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh new file mode 100644 index 0000000000..9e545d5488 --- /dev/null +++ b/models/matsuo-lab/weblab-10b-instruction-sft/harness.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +MODEL_NAME="weblab-10b-instruction-sft" +MODEL_ARGS="pretrained=matsuo-lab/${MODEL_NAME},torch_dtype=auto" +TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.2-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3" +NUM_FEWSHOT="3,3,3,2,1,1,0,5" +OUTPUT_PATH="models/matsuo-lab/${MODEL_NAME}/result.json" +python main.py \ + --model hf-causal \ + --model_args $MODEL_ARGS \ + --tasks $TASK \ + --num_fewshot $NUM_FEWSHOT \ + --device "cuda" \ + --output_path $OUTPUT_PATH diff --git a/models/matsuo-lab/weblab-10b-instruction-sft/result.json b/models/matsuo-lab/weblab-10b-instruction-sft/result.json new file mode 100644 index 0000000000..a5cb71a93d --- /dev/null +++ b/models/matsuo-lab/weblab-10b-instruction-sft/result.json @@ -0,0 +1,71 @@ +{ + "results": { + "jcommonsenseqa-1.1-0.3": { + "acc": 0.7462019660411081, + "acc_stderr": 0.013015217181453645, + "acc_norm": 0.7444146559428061, + "acc_norm_stderr": 0.013045313758426097 + }, + "jnli-1.1-0.3": { + "acc": 0.6655710764174199, + "acc_stderr": 0.009564848188061246, + "acc_norm": 0.5349219391947412, + "acc_norm_stderr": 0.010112000378231595 + }, + "marc_ja-1.1-0.3": { + "acc": 0.9548991864166961, + "acc_stderr": 0.0027601421998296426, + "acc_norm": 0.9548991864166961, + "acc_norm_stderr": 0.0027601421998296426 + }, + "xwinograd_ja": { + "acc": 0.7194994786235662, + "acc_stderr": 0.014514407890552954 + }, + "jsquad-1.1-0.3": { + "exact_match": 78.34308869878433, + "f1": 84.6243755976619 + }, + "jaqket_v2-0.2-0.3": { + "exact_match": 63.31615120274914, + "f1": 66.73484972454041 + }, + "xlsum_ja-1.0-0.3": { + "rouge2": 20.569208532135303 + }, + "mgsm-1.0-0.3": { + "acc": 0.02, + "acc_stderr": 0.008872139507342685 + } + }, + "versions": { + "jcommonsenseqa-1.1-0.3": 1.1, + "jnli-1.1-0.3": 1.1, + "marc_ja-1.1-0.3": 1.1, + "jsquad-1.1-0.3": 1.1, + "jaqket_v2-0.2-0.3": 0.2, + "xlsum_ja-1.0-0.3": 1.0, + "xwinograd_ja": 1.0, + "mgsm-1.0-0.3": 1.0 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=matsuo-lab/weblab-10b-instruction-sft,torch_dtype=auto", + "num_fewshot": [ + 3, + 3, + 3, + 2, + 1, + 1, + 0, + 5 + ], + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/models/matsuo-lab/weblab-10b/harness.sh b/models/matsuo-lab/weblab-10b/harness.sh new file mode 100644 index 0000000000..0266e900b0 --- /dev/null +++ b/models/matsuo-lab/weblab-10b/harness.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +MODEL_NAME="weblab-10b" +MODEL_ARGS="pretrained=matsuo-lab/${MODEL_NAME},torch_dtype=auto" +TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.2-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3" +NUM_FEWSHOT="3,3,3,2,1,1,0,5" +OUTPUT_PATH="models/matsuo-lab/${MODEL_NAME}/result.json" +python main.py \ + --model hf-causal \ + --model_args $MODEL_ARGS \ + --tasks $TASK \ + --num_fewshot $NUM_FEWSHOT \ + --device "cuda" \ + --output_path $OUTPUT_PATH diff --git a/models/matsuo-lab/weblab-10b/result.json b/models/matsuo-lab/weblab-10b/result.json new file mode 100644 index 0000000000..ecb10f2c2d --- /dev/null +++ b/models/matsuo-lab/weblab-10b/result.json @@ -0,0 +1,71 @@ +{ + "results": { + "jcommonsenseqa-1.1-0.3": { + "acc": 0.6657730116175157, + "acc_stderr": 0.014107918728785251, + "acc_norm": 0.6157283288650581, + "acc_norm_stderr": 0.0145476502199282 + }, + "jnli-1.1-0.3": { + "acc": 0.5373870172555464, + "acc_stderr": 0.010108377185797195, + "acc_norm": 0.49671322925225964, + "acc_norm_stderr": 0.0101365359837632 + }, + "marc_ja-1.1-0.3": { + "acc": 0.8206579412805094, + "acc_stderr": 0.005102495976041424, + "acc_norm": 0.8206579412805094, + "acc_norm_stderr": 0.005102495976041424 + }, + "xwinograd_ja": { + "acc": 0.7194994786235662, + "acc_stderr": 0.014514407890552965 + }, + "jsquad-1.1-0.3": { + "exact_match": 62.94461954074741, + "f1": 71.39321494073558 + }, + "jaqket_v2-0.2-0.3": { + "exact_match": 56.18556701030928, + "f1": 60.1091610756559 + }, + "xlsum_ja-1.0-0.3": { + "rouge2": 10.025750779559178 + }, + "mgsm-1.0-0.3": { + "acc": 0.024, + "acc_stderr": 0.009699087026964242 + } + }, + "versions": { + "jcommonsenseqa-1.1-0.3": 1.1, + "jnli-1.1-0.3": 1.1, + "marc_ja-1.1-0.3": 1.1, + "jsquad-1.1-0.3": 1.1, + "jaqket_v2-0.2-0.3": 0.2, + "xlsum_ja-1.0-0.3": 1.0, + "xwinograd_ja": 1.0, + "mgsm-1.0-0.3": 1.0 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=matsuo-lab/weblab-10b,torch_dtype=auto", + "num_fewshot": [ + 3, + 3, + 3, + 2, + 1, + 1, + 0, + 5 + ], + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file From 20a7315250ed5db0d9291abaa9d67f2a93907cc4 Mon Sep 17 00:00:00 2001 From: acc12649wd Date: Sun, 27 Aug 2023 10:11:53 +0900 Subject: [PATCH 2/2] add evaluation results for weblab-10b models --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c6b6e7d206..4e4a3b71a7 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,10 @@ ## Leaderboard | model | average | jcommonsenseqa | jnli | marc_ja | jsquad | jaqket_v2 | xlsum_ja | xwinograd_ja | mgsm | eval script | |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------:|-----------------:|-------:|----------:|---------:|------------:|-----------:|---------------:|-------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| matsuo-lab-weblab-10b-instruction-sft | 59.11 | 74.62 | 66.56 | 95.49 | 78.34 | 63.32 | 20.57 | 71.95 | 2 | models/matsuo-lab/weblab-10b-instruction-sft/harness.sh | | stabilityai-japanese-stablelm-instruct-alpha-7b | 54.71 | 82.22 | 52.05 | 82.88 | 63.26 | 74.83 | 7.79 | 72.68 | 2 | models/stabilityai/stabilityai-japanese-stablelm-instruct-alpha-7b/harness.sh | | stabilityai-japanese-stablelm-base-alpha-7b | 51.06 | 33.42 | 43.34 | 96.73 | 70.62 | 78.09 | 10.65 | 72.78 | 2.8 | models/stabilityai/stabilityai-japanese-stablelm-base-alpha-7b/harness.sh | +| matsuo-lab-weblab-10b | 50.74 | 66.58 | 53.74 | 82.07 | 62.94 | 56.19 | 10.03 | 71.95 | 2.4 | models/matsuo-lab/weblab-10b/harness.sh | | rinna-bilingual-gpt-neox-4b-instruction-sft | 47.75 | 49.51 | 47.08 | 95.28 | 55.99 | 61.17 | 5.51 | 64.65 | 2.8 | models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/harness.sh | | rinna-bilingual-gpt-neox-4b-instruction-ppo | 47.18 | 48.79 | 48.23 | 96.09 | 54.16 | 57.65 | 5.03 | 65.07 | 2.4 | models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/harness.sh | | llama2-13b-chat | 47.02 | 72.56 | 35.62 | 59.92 | 67.69 | 48.2 | 15.14 | 63.82 | 13.2 | models/llama2/llama2-13b-chat/harness.sh |