From 12b6eeb5b01cd1fe9da103e59b85e2c06bb82c93 Mon Sep 17 00:00:00 2001 From: "Ramiro R. C." Date: Mon, 20 Jan 2025 18:33:42 -0300 Subject: [PATCH] fixed mmlu generative response extraction (#2503) * fixed mmlu generative response extraction * updated file version | added args to exact_match * fix * fix * pre-commit * fix groups --------- Co-authored-by: Baber --- lm_eval/tasks/arabicmmlu/_generate_configs.py | 82 ++++++++++--------- lm_eval/tasks/mmlu/_generate_configs.py | 1 + .../mmlu/generative/_default_template_yaml | 16 +++- lm_eval/tasks/mmlu/generative/_mmlu.yaml | 20 ++--- 4 files changed, 68 insertions(+), 51 deletions(-) diff --git a/lm_eval/tasks/arabicmmlu/_generate_configs.py b/lm_eval/tasks/arabicmmlu/_generate_configs.py index ea59fe9830..5dc627e598 100644 --- a/lm_eval/tasks/arabicmmlu/_generate_configs.py +++ b/lm_eval/tasks/arabicmmlu/_generate_configs.py @@ -13,46 +13,48 @@ eval_logger = logging.getLogger("lm-eval") -SUBJECTS = {'Islamic Studies': 'humanities', - 'Driving Test': 'other', - 'Natural Science (Middle School)': 'stem', - 'Natural Science (Primary School)': 'stem', - 'History (Primary School)': 'humanities', - 'History (Middle School)': 'humanities', - 'History (High School)': 'humanities', - 'General Knowledge': 'other', - 'General Knowledge (Primary School)': 'other', - 'General Knowledge (Middle School)': 'other', - 'Law (Professional)': 'humanities', - 'Physics (High School)': 'stem', - 'Social Science (Middle School)': 'social_science', - 'Social Science (Primary School)': 'social_science', - 'Management (University)': 'other', - 'Arabic Language (Primary School)': 'language', - 'Arabic Language (Middle School)': 'language', - 'Arabic Language (High School)': 'language', - 'Political Science (University)': 'social_science', - 'Philosophy (High School)': 'humanities', - 'Accounting (University)': 'social_science', - 'Computer Science (University)': 'stem', - 'Computer Science (Middle School)': 'stem', - 'Computer Science (Primary School)': 'stem', - 'Computer Science (High School)': 'stem', - 'Geography (Primary School)': 'social_science', - 'Geography (Middle School)': 'social_science', - 'Geography (High School)': 'social_science', - 'Math (Primary School)': 'stem', - 'Biology (High School)': 'stem', - 'Economics (University)': 'social_science', - 'Economics (Middle School)': 'social_science', - 'Economics (High School)': 'social_science', - 'Arabic Language (General)': 'language', - 'Arabic Language (Grammar)': 'language', - 'Islamic Studies (High School)': 'humanities', - 'Islamic Studies (Middle School)': 'humanities', - 'Islamic Studies (Primary School)': 'humanities', - 'Civics (Middle School)': 'social_science', - 'Civics (High School)': 'social_science'} +SUBJECTS = { + "Islamic Studies": "humanities", + "Driving Test": "other", + "Natural Science (Middle School)": "stem", + "Natural Science (Primary School)": "stem", + "History (Primary School)": "humanities", + "History (Middle School)": "humanities", + "History (High School)": "humanities", + "General Knowledge": "other", + "General Knowledge (Primary School)": "other", + "General Knowledge (Middle School)": "other", + "Law (Professional)": "humanities", + "Physics (High School)": "stem", + "Social Science (Middle School)": "social_science", + "Social Science (Primary School)": "social_science", + "Management (University)": "other", + "Arabic Language (Primary School)": "language", + "Arabic Language (Middle School)": "language", + "Arabic Language (High School)": "language", + "Political Science (University)": "social_science", + "Philosophy (High School)": "humanities", + "Accounting (University)": "social_science", + "Computer Science (University)": "stem", + "Computer Science (Middle School)": "stem", + "Computer Science (Primary School)": "stem", + "Computer Science (High School)": "stem", + "Geography (Primary School)": "social_science", + "Geography (Middle School)": "social_science", + "Geography (High School)": "social_science", + "Math (Primary School)": "stem", + "Biology (High School)": "stem", + "Economics (University)": "social_science", + "Economics (Middle School)": "social_science", + "Economics (High School)": "social_science", + "Arabic Language (General)": "language", + "Arabic Language (Grammar)": "language", + "Islamic Studies (High School)": "humanities", + "Islamic Studies (Middle School)": "humanities", + "Islamic Studies (Primary School)": "humanities", + "Civics (Middle School)": "social_science", + "Civics (High School)": "social_science", +} def parse_args(): diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index 28b94616dd..58876d4c10 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -1,3 +1,4 @@ +# noqa """ Take in a YAML, and output all "other" splits with this YAML """ diff --git a/lm_eval/tasks/mmlu/generative/_default_template_yaml b/lm_eval/tasks/mmlu/generative/_default_template_yaml index 1452e0f5b3..7281f0a1e0 100644 --- a/lm_eval/tasks/mmlu/generative/_default_template_yaml +++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml @@ -14,7 +14,21 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true + ignore_punctuation: true + ignore_case: true +filter_list: + - name: get_response + filter: + # Filter everything after the first break line + - function: "regex" + regex_pattern: "^(.*?)(?=\\n|$)" + # Remove leading white spaces + - function: remove_whitespace + # function to ignore right white spaces or line breaks + - function: "regex" + regex_pattern: "^(.*?)\\s*$" + - function: take_first metadata: - version: 2.0 + version: 3.0 dataset_kwargs: trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/generative/_mmlu.yaml b/lm_eval/tasks/mmlu/generative/_mmlu.yaml index 1a63611bdb..e4f4b5d5a8 100644 --- a/lm_eval/tasks/mmlu/generative/_mmlu.yaml +++ b/lm_eval/tasks/mmlu/generative/_mmlu.yaml @@ -5,29 +5,29 @@ task: task: - mmlu_stem_generative aggregate_metric_list: - - metric: acc - weight_by_size: True + - metric: exact_match + weight_by_size: true - group: other task: - mmlu_other_generative aggregate_metric_list: - - metric: acc - weight_by_size: True + - metric: exact_match + weight_by_size: true - group: social sciences task: - mmlu_social_sciences_generative aggregate_metric_list: - - metric: acc - weight_by_size: True + - metric: exact_match + weight_by_size: true - group: humanities task: - mmlu_humanities_generative aggregate_metric_list: - - metric: acc - weight_by_size: True + - metric: exact_match + weight_by_size: true aggregate_metric_list: - aggregation: mean metric: exact_match - weight_by_size: True + weight_by_size: true metadata: - version: 2 + version: 3