From 97dccd3e2cf8380c66030cd3b46e4876a019110a Mon Sep 17 00:00:00 2001 From: Ramiro Rodriguez Colmeiro Date: Mon, 18 Nov 2024 15:47:52 -0300 Subject: [PATCH] fixed mmlu generative response extraction --- .../tasks/mmlu/generative/_default_template_yaml | 12 ++++++++++++ lm_eval/tasks/mmlu/generative/_mmlu.yaml | 13 +++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lm_eval/tasks/mmlu/generative/_default_template_yaml b/lm_eval/tasks/mmlu/generative/_default_template_yaml index 1452e0f5b3..3667d5612f 100644 --- a/lm_eval/tasks/mmlu/generative/_default_template_yaml +++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml @@ -14,6 +14,18 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true +filter_list: + - name: get_response + filter: + # Keep only the text before the first line break + - function: "regex" + regex_pattern: "^(.*?)(?=\\n|$)" + # Remove leading whitespace + - function: remove_whitespace + # Strip trailing whitespace and line breaks + - function: "regex" + regex_pattern: "^(.*?)\\s*$" + - function: take_first metadata: version: 2.0 dataset_kwargs: diff --git a/lm_eval/tasks/mmlu/generative/_mmlu.yaml b/lm_eval/tasks/mmlu/generative/_mmlu.yaml index 1a63611bdb..00bdd8f5cc 100644 --- a/lm_eval/tasks/mmlu/generative/_mmlu.yaml +++ b/lm_eval/tasks/mmlu/generative/_mmlu.yaml @@ -5,29 +5,34 @@ task: task: - mmlu_stem_generative aggregate_metric_list: - - metric: acc + - metric: exact_match weight_by_size: True + filter_list: get_response - group: other task: - mmlu_other_generative aggregate_metric_list: - - metric: acc + - metric: exact_match weight_by_size: True + filter_list: get_response - group: social sciences task: - mmlu_social_sciences_generative aggregate_metric_list: - - metric: acc + - metric: exact_match weight_by_size: True + filter_list: get_response - group: humanities task: - mmlu_humanities_generative aggregate_metric_list: - - metric: acc + - metric: exact_match 
weight_by_size: True + filter_list: get_response aggregate_metric_list: - aggregation: mean metric: exact_match weight_by_size: True + filter_list: get_response metadata: version: 2