separate category for global_mmlu (#2652)

* separate category
* set version 0.0
* apply precommit
EleutherAI · Jan 24, 2025 · 5c006ed · 5c006ed
1 parent 370e2f9
commit 5c006ed
Show file tree

Hide file tree

Showing 193 changed files with 1,092 additions and 146 deletions.
diff --git a/lm_eval/tasks/global_mmlu/default/_generate_configs.py b/lm_eval/tasks/global_mmlu/default/_generate_configs.py
diff --git a/lm_eval/tasks/global_mmlu/default/_default_yaml b/lm_eval/tasks/global_mmlu/default/ar/_ar_template_yaml
@@ -1,6 +1,5 @@
-tag:
-  - global_mmlu
 dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: ar
 test_split: test
 fewshot_split: dev
 fewshot_config:

diff --git a/lm_eval/tasks/global_mmlu/default/ar/_global_mmlu_ar.yaml b/lm_eval/tasks/global_mmlu/default/ar/_global_mmlu_ar.yaml
@@ -0,0 +1,13 @@
+group: global_mmlu_ar
+task:
+  - global_mmlu_ar_business
+  - global_mmlu_ar_humanities
+  - global_mmlu_ar_medical
+  - global_mmlu_ar_other
+  - global_mmlu_ar_stem
+  - global_mmlu_ar_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_business.yaml b/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_business.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _ar_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_ar_business
diff --git a/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_humanities.yaml b/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_humanities.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _ar_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_ar_humanities
diff --git a/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_medical.yaml b/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_medical.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _ar_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_ar_medical
diff --git a/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_other.yaml b/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_other.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _ar_template_yaml
+process_docs: !function utils.process_other
+task: global_mmlu_ar_other
diff --git a/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_social_sciences.yaml b/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_social_sciences.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _ar_template_yaml
+process_docs: !function utils.process_social_sciences
+task: global_mmlu_ar_social_sciences
diff --git a/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_stem.yaml b/lm_eval/tasks/global_mmlu/default/ar/global_mmlu_ar_stem.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _ar_template_yaml
+process_docs: !function utils.process_stem
+task: global_mmlu_ar_stem
diff --git a/lm_eval/tasks/global_mmlu/default/ar/utils.py b/lm_eval/tasks/global_mmlu/default/ar/utils.py
@@ -0,0 +1,18 @@
+from functools import partial
+
+
+CATEGORIES = ["Business", "Humanities", "Medical", "Other", "STEM", "Social Sciences"]
+
+
+def process_docs(dataset, category):
+    return dataset.filter(lambda x: x["subject_category"] == category)
+
+
+process_functions = {
+    f"process_{category.lower().replace(' ', '_')}": partial(
+        process_docs, category=category
+    )
+    for category in CATEGORIES
+}
+
+globals().update(process_functions)
diff --git a/lm_eval/tasks/global_mmlu/default/bn/_bn_template_yaml b/lm_eval/tasks/global_mmlu/default/bn/_bn_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: bn
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/bn/_global_mmlu_bn.yaml b/lm_eval/tasks/global_mmlu/default/bn/_global_mmlu_bn.yaml
@@ -0,0 +1,13 @@
+group: global_mmlu_bn
+task:
+  - global_mmlu_bn_business
+  - global_mmlu_bn_humanities
+  - global_mmlu_bn_medical
+  - global_mmlu_bn_other
+  - global_mmlu_bn_stem
+  - global_mmlu_bn_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_business.yaml b/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_business.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _bn_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_bn_business
diff --git a/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_humanities.yaml b/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_humanities.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _bn_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_bn_humanities
diff --git a/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_medical.yaml b/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_medical.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _bn_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_bn_medical
diff --git a/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_other.yaml b/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_other.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _bn_template_yaml
+process_docs: !function utils.process_other
+task: global_mmlu_bn_other
diff --git a/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_social_sciences.yaml b/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_social_sciences.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _bn_template_yaml
+process_docs: !function utils.process_social_sciences
+task: global_mmlu_bn_social_sciences
diff --git a/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_stem.yaml b/lm_eval/tasks/global_mmlu/default/bn/global_mmlu_bn_stem.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _bn_template_yaml
+process_docs: !function utils.process_stem
+task: global_mmlu_bn_stem
diff --git a/lm_eval/tasks/global_mmlu/default/bn/utils.py b/lm_eval/tasks/global_mmlu/default/bn/utils.py
@@ -0,0 +1,18 @@
+from functools import partial
+
+
+CATEGORIES = ["Business", "Humanities", "Medical", "Other", "STEM", "Social Sciences"]
+
+
+def process_docs(dataset, category):
+    return dataset.filter(lambda x: x["subject_category"] == category)
+
+
+process_functions = {
+    f"process_{category.lower().replace(' ', '_')}": partial(
+        process_docs, category=category
+    )
+    for category in CATEGORIES
+}
+
+globals().update(process_functions)
diff --git a/lm_eval/tasks/global_mmlu/default/de/_de_template_yaml b/lm_eval/tasks/global_mmlu/default/de/_de_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: de
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/de/_global_mmlu_de.yaml b/lm_eval/tasks/global_mmlu/default/de/_global_mmlu_de.yaml
@@ -0,0 +1,13 @@
+group: global_mmlu_de
+task:
+  - global_mmlu_de_business
+  - global_mmlu_de_humanities
+  - global_mmlu_de_medical
+  - global_mmlu_de_other
+  - global_mmlu_de_stem
+  - global_mmlu_de_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_business.yaml b/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_business.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _de_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_de_business
diff --git a/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_humanities.yaml b/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_humanities.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _de_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_de_humanities
diff --git a/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_medical.yaml b/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_medical.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _de_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_de_medical
diff --git a/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_other.yaml b/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_other.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _de_template_yaml
+process_docs: !function utils.process_other
+task: global_mmlu_de_other
diff --git a/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_social_sciences.yaml b/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_social_sciences.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _de_template_yaml
+process_docs: !function utils.process_social_sciences
+task: global_mmlu_de_social_sciences
diff --git a/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_stem.yaml b/lm_eval/tasks/global_mmlu/default/de/global_mmlu_de_stem.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _de_template_yaml
+process_docs: !function utils.process_stem
+task: global_mmlu_de_stem
diff --git a/lm_eval/tasks/global_mmlu/default/de/utils.py b/lm_eval/tasks/global_mmlu/default/de/utils.py
@@ -0,0 +1,18 @@
+from functools import partial
+
+
+CATEGORIES = ["Business", "Humanities", "Medical", "Other", "STEM", "Social Sciences"]
+
+
+def process_docs(dataset, category):
+    return dataset.filter(lambda x: x["subject_category"] == category)
+
+
+process_functions = {
+    f"process_{category.lower().replace(' ', '_')}": partial(
+        process_docs, category=category
+    )
+    for category in CATEGORIES
+}
+
+globals().update(process_functions)
diff --git a/lm_eval/tasks/global_mmlu/default/en/_en_template_yaml b/lm_eval/tasks/global_mmlu/default/en/_en_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: en
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/en/_global_mmlu_en.yaml b/lm_eval/tasks/global_mmlu/default/en/_global_mmlu_en.yaml
@@ -0,0 +1,13 @@
+group: global_mmlu_en
+task:
+  - global_mmlu_en_business
+  - global_mmlu_en_humanities
+  - global_mmlu_en_medical
+  - global_mmlu_en_other
+  - global_mmlu_en_stem
+  - global_mmlu_en_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_business.yaml b/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_business.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _en_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_en_business
diff --git a/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_humanities.yaml b/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_humanities.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _en_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_en_humanities
diff --git a/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_medical.yaml b/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_medical.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _en_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_en_medical
diff --git a/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_other.yaml b/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_other.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _en_template_yaml
+process_docs: !function utils.process_other
+task: global_mmlu_en_other
diff --git a/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_social_sciences.yaml b/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_social_sciences.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _en_template_yaml
+process_docs: !function utils.process_social_sciences
+task: global_mmlu_en_social_sciences
diff --git a/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_stem.yaml b/lm_eval/tasks/global_mmlu/default/en/global_mmlu_en_stem.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _en_template_yaml
+process_docs: !function utils.process_stem
+task: global_mmlu_en_stem
diff --git a/lm_eval/tasks/global_mmlu/default/en/utils.py b/lm_eval/tasks/global_mmlu/default/en/utils.py
@@ -0,0 +1,18 @@
+from functools import partial
+
+
+CATEGORIES = ["Business", "Humanities", "Medical", "Other", "STEM", "Social Sciences"]
+
+
+def process_docs(dataset, category):
+    return dataset.filter(lambda x: x["subject_category"] == category)
+
+
+process_functions = {
+    f"process_{category.lower().replace(' ', '_')}": partial(
+        process_docs, category=category
+    )
+    for category in CATEGORIES
+}
+
+globals().update(process_functions)
diff --git a/lm_eval/tasks/global_mmlu/default/es/_es_template_yaml b/lm_eval/tasks/global_mmlu/default/es/_es_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: es
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/es/_global_mmlu_es.yaml b/lm_eval/tasks/global_mmlu/default/es/_global_mmlu_es.yaml
@@ -0,0 +1,13 @@
+group: global_mmlu_es
+task:
+  - global_mmlu_es_business
+  - global_mmlu_es_humanities
+  - global_mmlu_es_medical
+  - global_mmlu_es_other
+  - global_mmlu_es_stem
+  - global_mmlu_es_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/global_mmlu/default/es/global_mmlu_es_business.yaml b/lm_eval/tasks/global_mmlu/default/es/global_mmlu_es_business.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _es_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_es_business
diff --git a/lm_eval/tasks/global_mmlu/default/es/global_mmlu_es_humanities.yaml b/lm_eval/tasks/global_mmlu/default/es/global_mmlu_es_humanities.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _es_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_es_humanities
diff --git a/lm_eval/tasks/global_mmlu/default/es/global_mmlu_es_medical.yaml b/lm_eval/tasks/global_mmlu/default/es/global_mmlu_es_medical.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+include: _es_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_es_medical