From b2c090cc971e911c62f6f9a848c20cafb1488ec3 Mon Sep 17 00:00:00 2001 From: Minho Ryu Date: Wed, 22 Jan 2025 01:48:22 +0900 Subject: [PATCH] aggregate by group (total and categories) (#2643) --- lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml | 3 --- lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard.yaml | 11 +++++++++++ .../cot_hard/_kmmlu_cot_hard_applied_science.yaml | 8 ++++++++ .../tasks/kmmlu/cot_hard/_kmmlu_cot_hard_humss.yaml | 8 ++++++++ .../tasks/kmmlu/cot_hard/_kmmlu_cot_hard_other.yaml | 8 ++++++++ .../tasks/kmmlu/cot_hard/_kmmlu_cot_hard_stem.yaml | 8 ++++++++ .../kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml | 3 ++- .../kmmlu_cot_hard_agricultural_sciences.yaml | 3 ++- ...cot_hard_aviation_engineering_and_maintenance.yaml | 3 ++- .../tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_chemical_engineering.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_civil_engineering.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_computer_science.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml | 3 ++- .../tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_education.yaml | 3 ++- .../kmmlu_cot_hard_electrical_engineering.yaml | 3 ++- .../kmmlu_cot_hard_electronics_engineering.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_energy_management.yaml | 3 ++- .../kmmlu_cot_hard_environmental_science.yaml | 3 ++- .../tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_food_processing.yaml | 3 ++- ...kmmlu_cot_hard_gas_technology_and_engineering.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml | 3 ++- .../tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_industrial_engineer.yaml | 3 ++- .../kmmlu_cot_hard_information_technology.yaml | 3 ++- ...mlu_cot_hard_interior_architecture_and_design.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml | 3 ++- lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml | 3 ++- ...mlu_cot_hard_machine_design_and_manufacturing.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_management.yaml | 3 ++- .../cot_hard/kmmlu_cot_hard_maritime_engineering.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml | 3 ++- .../kmmlu_cot_hard_materials_engineering.yaml | 3 ++- lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml | 3 ++- .../kmmlu_cot_hard_mechanical_engineering.yaml | 3 ++- .../kmmlu_cot_hard_nondestructive_testing.yaml | 3 ++- .../tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml | 3 ++- ...mmlu_cot_hard_political_science_and_sociology.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml | 3 ++- ...u_cot_hard_railway_and_automotive_engineering.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml | 3 ++- .../kmmlu_cot_hard_refrigerating_machinery.yaml | 3 ++- .../kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml | 3 ++- .../tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml | 3 ++- ...rd_telecommunications_and_wireless_technology.yaml | 3 ++- lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml | 3 --- lm_eval/tasks/kmmlu/direct/_kmmlu_direct.yaml | 11 +++++++++++ .../kmmlu/direct/_kmmlu_direct_applied_science.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/direct/_kmmlu_direct_humss.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/direct/_kmmlu_direct_other.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/direct/_kmmlu_direct_stem.yaml | 8 ++++++++ .../tasks/kmmlu/direct/kmmlu_direct_accounting.yaml | 1 + .../direct/kmmlu_direct_agricultural_sciences.yaml | 1 + ...u_direct_aviation_engineering_and_maintenance.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml | 1 + .../direct/kmmlu_direct_chemical_engineering.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml | 1 + .../kmmlu/direct/kmmlu_direct_civil_engineering.yaml | 1 + .../kmmlu/direct/kmmlu_direct_computer_science.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_construction.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_economics.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_education.yaml | 1 + .../direct/kmmlu_direct_electrical_engineering.yaml | 1 + .../direct/kmmlu_direct_electronics_engineering.yaml | 1 + .../kmmlu/direct/kmmlu_direct_energy_management.yaml | 1 + .../direct/kmmlu_direct_environmental_science.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml | 1 + .../kmmlu/direct/kmmlu_direct_food_processing.yaml | 1 + .../kmmlu_direct_gas_technology_and_engineering.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml | 1 + .../direct/kmmlu_direct_industrial_engineer.yaml | 1 + .../direct/kmmlu_direct_information_technology.yaml | 1 + ...kmmlu_direct_interior_architecture_and_design.yaml | 1 + .../kmmlu/direct/kmmlu_direct_korean_history.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml | 1 + ...kmmlu_direct_machine_design_and_manufacturing.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_management.yaml | 1 + .../direct/kmmlu_direct_maritime_engineering.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_marketing.yaml | 1 + .../direct/kmmlu_direct_materials_engineering.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml | 1 + .../direct/kmmlu_direct_mechanical_engineering.yaml | 1 + .../direct/kmmlu_direct_nondestructive_testing.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml | 1 + .../kmmlu_direct_political_science_and_sociology.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_psychology.yaml | 1 + .../kmmlu/direct/kmmlu_direct_public_safety.yaml | 1 + ...mlu_direct_railway_and_automotive_engineering.yaml | 1 + .../tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml | 1 + .../direct/kmmlu_direct_refrigerating_machinery.yaml | 1 + .../kmmlu/direct/kmmlu_direct_social_welfare.yaml | 1 + lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml | 1 + ...ct_telecommunications_and_wireless_technology.yaml | 1 + .../tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml | 3 --- .../tasks/kmmlu/direct_hard/_kmmlu_direct_hard.yaml | 11 +++++++++++ .../_kmmlu_direct_hard_applied_science.yaml | 8 ++++++++ .../kmmlu/direct_hard/_kmmlu_direct_hard_humss.yaml | 8 ++++++++ .../kmmlu/direct_hard/_kmmlu_direct_hard_other.yaml | 8 ++++++++ .../kmmlu/direct_hard/_kmmlu_direct_hard_stem.yaml | 8 ++++++++ .../direct_hard/kmmlu_direct_hard_accounting.yaml | 3 ++- .../kmmlu_direct_hard_agricultural_sciences.yaml | 3 ++- ...ect_hard_aviation_engineering_and_maintenance.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml | 3 ++- .../kmmlu_direct_hard_chemical_engineering.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_chemistry.yaml | 3 ++- .../kmmlu_direct_hard_civil_engineering.yaml | 3 ++- .../kmmlu_direct_hard_computer_science.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_construction.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_criminal_law.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_economics.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_education.yaml | 3 ++- .../kmmlu_direct_hard_electrical_engineering.yaml | 3 ++- .../kmmlu_direct_hard_electronics_engineering.yaml | 3 ++- .../kmmlu_direct_hard_energy_management.yaml | 3 ++- .../kmmlu_direct_hard_environmental_science.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml | 3 ++- .../kmmlu_direct_hard_food_processing.yaml | 3 ++- ...lu_direct_hard_gas_technology_and_engineering.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_geomatics.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_health.yaml | 3 ++- .../kmmlu_direct_hard_industrial_engineer.yaml | 3 ++- .../kmmlu_direct_hard_information_technology.yaml | 3 ++- ..._direct_hard_interior_architecture_and_design.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_korean_history.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_law.yaml | 3 ++- ..._direct_hard_machine_design_and_manufacturing.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_management.yaml | 3 ++- .../kmmlu_direct_hard_maritime_engineering.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_marketing.yaml | 3 ++- .../kmmlu_direct_hard_materials_engineering.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_math.yaml | 3 ++- .../kmmlu_direct_hard_mechanical_engineering.yaml | 3 ++- .../kmmlu_direct_hard_nondestructive_testing.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml | 3 ++- ...u_direct_hard_political_science_and_sociology.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_psychology.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_public_safety.yaml | 3 ++- ...irect_hard_railway_and_automotive_engineering.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_real_estate.yaml | 3 ++- .../kmmlu_direct_hard_refrigerating_machinery.yaml | 3 ++- .../direct_hard/kmmlu_direct_hard_social_welfare.yaml | 3 ++- .../kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml | 3 ++- ...rd_telecommunications_and_wireless_technology.yaml | 3 ++- lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml | 6 ------ lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml | 11 +++++++++++ .../tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml | 8 ++++++++ lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml | 1 + .../kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml | 1 + ...mlu_hard_aviation_engineering_and_maintenance.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml | 1 + .../kmmlu/hard/kmmlu_hard_chemical_engineering.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml | 1 + .../kmmlu/hard/kmmlu_hard_civil_engineering.yaml | 1 + .../tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml | 1 + .../kmmlu/hard/kmmlu_hard_electrical_engineering.yaml | 1 + .../hard/kmmlu_hard_electronics_engineering.yaml | 1 + .../kmmlu/hard/kmmlu_hard_energy_management.yaml | 1 + .../kmmlu/hard/kmmlu_hard_environmental_science.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml | 1 + .../tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml | 1 + .../kmmlu_hard_gas_technology_and_engineering.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml | 1 + .../kmmlu/hard/kmmlu_hard_industrial_engineer.yaml | 1 + .../kmmlu/hard/kmmlu_hard_information_technology.yaml | 1 + .../kmmlu_hard_interior_architecture_and_design.yaml | 1 + .../tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml | 1 + .../kmmlu_hard_machine_design_and_manufacturing.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml | 1 + .../kmmlu/hard/kmmlu_hard_maritime_engineering.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml | 1 + .../kmmlu/hard/kmmlu_hard_materials_engineering.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml | 1 + .../kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml | 1 + .../kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml | 1 + .../kmmlu_hard_political_science_and_sociology.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml | 1 + .../tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml | 1 + ...kmmlu_hard_railway_and_automotive_engineering.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml | 1 + .../hard/kmmlu_hard_refrigerating_machinery.yaml | 1 + .../tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml | 1 + lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml | 1 + ...rd_telecommunications_and_wireless_technology.yaml | 1 + 204 files changed, 442 insertions(+), 105 deletions(-) create mode 100644 lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard.yaml create mode 100644 lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_applied_science.yaml create mode 100644 lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_humss.yaml create mode 100644 lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_other.yaml create mode 100644 lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_stem.yaml create mode 100644 lm_eval/tasks/kmmlu/direct/_kmmlu_direct.yaml create mode 100644 lm_eval/tasks/kmmlu/direct/_kmmlu_direct_applied_science.yaml create mode 100644 lm_eval/tasks/kmmlu/direct/_kmmlu_direct_humss.yaml create mode 100644 lm_eval/tasks/kmmlu/direct/_kmmlu_direct_other.yaml create mode 100644 lm_eval/tasks/kmmlu/direct/_kmmlu_direct_stem.yaml create mode 100644 lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard.yaml create mode 100644 lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_applied_science.yaml create mode 100644 lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_humss.yaml create mode 100644 lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_other.yaml create mode 100644 lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_stem.yaml create mode 100644 lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml create mode 100644 lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml create mode 100644 lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml create mode 100644 lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml create mode 100644 lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml diff --git a/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml b/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml index 163a03dfd2..0c0fadf735 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml @@ -1,6 +1,3 @@ -tag: - - kmmlu - - kmmlu_hard_cot dataset_path: HAERAE-HUB/KMMLU-HARD output_type: generate_until validation_split: dev # not meant to be used, only here to silence warnings diff --git a/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard.yaml b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard.yaml new file mode 100644 index 0000000000..1e459a05d6 --- /dev/null +++ b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard.yaml @@ -0,0 +1,11 @@ +group: kmmlu_cot_hard +task: + - kmmlu_cot_hard_stem + - kmmlu_cot_hard_other + - kmmlu_cot_hard_applied_science + - kmmlu_cot_hard_humss +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_applied_science.yaml b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_applied_science.yaml new file mode 100644 index 0000000000..4944cefb60 --- /dev/null +++ b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_applied_science.yaml @@ -0,0 +1,8 @@ +group: kmmlu_cot_hard_applied_science +task: + - kmmlu_cot_hard_applied_science_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_humss.yaml b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_humss.yaml new file mode 100644 index 0000000000..7b30f3588d --- /dev/null +++ b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_humss.yaml @@ -0,0 +1,8 @@ +group: kmmlu_cot_hard_humss +task: + - kmmlu_cot_hard_humss_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_other.yaml b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_other.yaml new file mode 100644 index 0000000000..70329cf494 --- /dev/null +++ b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_other.yaml @@ -0,0 +1,8 @@ +group: kmmlu_cot_hard_other +task: + - kmmlu_cot_hard_other_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_stem.yaml b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_stem.yaml new file mode 100644 index 0000000000..65d92fe270 --- /dev/null +++ b/lm_eval/tasks/kmmlu/cot_hard/_kmmlu_cot_hard_stem.yaml @@ -0,0 +1,8 @@ +group: kmmlu_cot_hard_stem +task: + - kmmlu_cot_hard_stem_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml index bb17436e43..0a89dce5e4 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml @@ -78,4 +78,5 @@ fewshot_config: 당기순이익은 과소 계상됩니다. 왜냐하면 매출원가가 더 높아지면 이익은 줄어들기 때문입니다. , 상품재고액을 과대 계상한 경우 매출원가는 과대 계상되고, 당기순이익은 과소 계상됩니다. '따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_accounting +task: kmmlu_cot_hard_accounting +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_agricultural_sciences.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_agricultural_sciences.yaml index b100094b57..d3ab573490 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_agricultural_sciences.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_agricultural_sciences.yaml @@ -80,4 +80,5 @@ fewshot_config: 각 선택지를 분석한 결과 (C) 선택지인 '감자의 바이러스 병을 막기 위해 평지에서 채종한다.'가 가장 잘못된 방법으로 보입니다. 이는 감자의 바이러스 병 예방과 평지에서의 채종 사이에 직접적인 연관성이 없기 때문입니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_agricultural_sciences +task: kmmlu_cot_hard_agricultural_sciences +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml index f9cd217f7b..dcc59f889f 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml @@ -85,4 +85,5 @@ fewshot_config: (D) 옆놀이의 안정성 향상을 위해서는 트위스트가 중요한 역할을 합니다. 트위스트는 날개 팁 부분의 각도를 조절하여, 항공기가 고속에서도 안정적으로 비행할 수 있도록 돕습니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_aviation_engineering_and_maintenance +task: kmmlu_cot_hard_aviation_engineering_and_maintenance +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml index 4d6e52b77c..52e0c77d83 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml @@ -80,4 +80,5 @@ fewshot_config: 없어야 합니다. 이러한 조건을 충족하는 미생물은 절대호산성 미생물입니다. 절대호산성 미생물은 극도로 산성 환경에서만 생존할 수 있으며, 중성 또는 알칼리성 환경에서는 성장할 수 없습니다. 따라서, 정답은 (A) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_biology +task: kmmlu_cot_hard_biology +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemical_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemical_engineering.yaml index 9b7435d3f5..49ebe86600 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemical_engineering.yaml @@ -87,4 +87,5 @@ fewshot_config: 압력, V는 부피입니다. W = -P1Vln(P2/P1) = -(10×10^5 Pa)(0.05m^3)ln((1×10^5 Pa)/(10×10^5 Pa)) = 0입니다. 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_chemical_engineering +task: kmmlu_cot_hard_chemical_engineering +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml index d761f5e22f..0cfd1dff14 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml @@ -76,4 +76,5 @@ fewshot_config: 황산의 분자량은 98g/mol입니다. 황산의 몰 수는 49g ÷ 98g/mol = 0.5mol입니다. 이 수용액의 물 농도는 0.5mol/1L = 0.5M입니다. 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_chemistry +task: kmmlu_cot_hard_chemistry +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_civil_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_civil_engineering.yaml index 87d3d22e5a..13893796b0 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_civil_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_civil_engineering.yaml @@ -97,4 +97,5 @@ fewshot_config: 것이며, 이 계약은 미국의 근대도시계획 성립기에 지역제의 바탕이 된 제도는 (A) 협약(covenant)이 가장 적절한 선택입니다. 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_civil_engineering +task: kmmlu_cot_hard_civil_engineering +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_computer_science.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_computer_science.yaml index 463b8e75b3..f83994093a 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_computer_science.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_computer_science.yaml @@ -96,4 +96,5 @@ fewshot_config: 주어진 설명에서 언급된 감사 추적(Auditing)이나 Shadow Password와 같은 부가적인 기능보다는 사용자 간 침범 차단과 사용자별 파일 권한 설정에 초점을 맞춘 것으로 정의됩니다. 따라서, 정답은 (B) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_computer_science +task: kmmlu_cot_hard_computer_science +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml index a277f637fa..3cfb3e9f89 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml @@ -83,4 +83,5 @@ fewshot_config: 압축비가 9입니다. 이를 식에 대입하여 연소실 체적을 계산해 보겠습니다. 행정체적 = 240 압축비 = 9 연소실_체적 = 행정체적 / (압축비 - 1) = 240 / 8 = 30 연소실의 체적은 30cc입니다. 따라서, 정답은 (B) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_construction +task: kmmlu_cot_hard_construction +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml index fa46f0f45b..559ff679f9 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml @@ -106,4 +106,5 @@ fewshot_config: 고려에 따라 변경된 경우에 형법 제1조 제2항이 적용되는 것은 맞지만, 법령의 변경이 있더라도 그것이 반성적 고려에서 비롯된 것이 아니라면 형법 제1조 제2항이 적용되지 않습니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_criminal_law +task: kmmlu_cot_hard_criminal_law +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml index 0da8a9a734..0d59eaa7af 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml @@ -88,4 +88,5 @@ fewshot_config: 질소는 질소 가스나 암모니아의 형태로 존재합니다. , 위의 분석에 따르면, (B) 옵션의 설명이 잘못되었습니다. 아연과 코발트는 일반적으로 미량원소로 분류되지만, 이 설명에서는 그들을 다량 원소로 잘못 분류하고 있습니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_ecology +task: kmmlu_cot_hard_ecology +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml index baaa3bd980..838e0bd5e7 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml @@ -87,4 +87,5 @@ fewshot_config: - 7,000,000원)은 어떻게 처리되었을까요? 이 금액은 회사의 이익에서 차감되어야 합니다. 이는 이익잉여금, 즉 회사가 벌어들인 이익에서 퇴직금 등을 지급한 후 남은 금액이 감소한다는 의미입니다. 선택지 D는 옳지 않습니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_economics +task: kmmlu_cot_hard_economics +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_education.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_education.yaml index 0e68a0b06d..899c38a803 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_education.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_education.yaml @@ -81,4 +81,5 @@ fewshot_config: 수 있습니다. 하지만, 평생교육법에는 전국평생학습도시협의회의 구성 및 운영에 필요한 사항은 교육부령으로 정한다는 내용은 없습니다. 이를 통해 선택지 (B)가 평생교육법 상평생학습도시에 대한 설명으로 옳지 않다는 것을 알 수 있습니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_education +task: kmmlu_cot_hard_education +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electrical_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electrical_engineering.yaml index 3b5af8b76d..f68f2ca73a 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electrical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electrical_engineering.yaml @@ -93,4 +93,5 @@ fewshot_config: Pyrometer)입니다. 이는 흑체 또는 비슷한 조건의 물체로부터 방사되는 가시광선의 강도를 통해 온도를 측정하는 방식을 사용하며, 흑체의 방사율을 1로 설정하여 보정하는 원리를 기반으로 합니다. 따라서, 정답은 (A) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_electrical_engineering +task: kmmlu_cot_hard_electrical_engineering +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electronics_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electronics_engineering.yaml index 4d1327034f..ce9ce0e36a 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electronics_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electronics_engineering.yaml @@ -79,4 +79,5 @@ fewshot_config: 합니다. 마지막으로 선택지 (D)는 컴퓨터 시스템의 하드웨어 오류를 발견하고 그에 대한 적절한 조치를 한다는 내용입니다. 이 역시 운영체제의 기능으로, 하드웨어 오류를 감지하고 적절한 처리를 하여 시스템의 안정성을 유지하는 역할을 합니다. 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_electronics_engineering +task: kmmlu_cot_hard_electronics_engineering +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_energy_management.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_energy_management.yaml index 8dca183aa9..0c5e18b1e9 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_energy_management.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_energy_management.yaml @@ -85,4 +85,5 @@ fewshot_config: 요인이 아닙니다. , 태양광발전 모듈의 I-V 특성곡선에서 일사량에 따라 가장 많이 변화하는 것은 전류입니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_energy_management +task: kmmlu_cot_hard_energy_management +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_environmental_science.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_environmental_science.yaml index d9080b078a..47de0dca4d 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_environmental_science.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_environmental_science.yaml @@ -82,4 +82,5 @@ fewshot_config: 전통적인 구성요소는 아닙니다. 과정분석은 보다 일반적인 용어로, 다양한 맥락에서 사용될 수 있습니다. (D) 목록분석 (Inventory Analysis): 이 역시 LCA의 핵심 단계 중 하나입니다. 따라서, 정답은 (C) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_environmental_science +task: kmmlu_cot_hard_environmental_science +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml index 983a6590a8..598aad051f 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml @@ -84,4 +84,5 @@ fewshot_config: 수선 등을 포함한 종합적인 서비스를 제공하는 것으로 보입니다. 이는 일반적인 클리닝 서비스와는 차별화된 서비스라고 볼 수 있습니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_fashion +task: kmmlu_cot_hard_fashion +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_food_processing.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_food_processing.yaml index 2d3473f07a..3cbec3d867 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_food_processing.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_food_processing.yaml @@ -78,4 +78,5 @@ fewshot_config: 이 품종은 상대적으로 높은 온도에 더 민감하게 반응하며, 일장의 변화에는 덜 민감한 특성을 가지고 있어 한국의 기후 특성에서 효과적으로 성장할 수 있는 조건을 가지고 있습니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_food_processing +task: kmmlu_cot_hard_food_processing +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_gas_technology_and_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_gas_technology_and_engineering.yaml index a244b95500..49551077b7 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_gas_technology_and_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_gas_technology_and_engineering.yaml @@ -85,4 +85,5 @@ fewshot_config: 이들은 모두 환경에 해롭습니다. 물은 염소 가스의 재해 방지용으로서의 흡수제나 재해제로서 적합하지 않습니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_gas_technology_and_engineering +task: kmmlu_cot_hard_gas_technology_and_engineering +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml index cfc4866a62..961b20cefa 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml @@ -75,4 +75,5 @@ fewshot_config: 공식은 실제 거리의 제곱근에 축척분모를 곱한 값이 측정된 면적이 될 것입니다. 이렇게 보면, 공식이 의미하는 바를 잘 나타내는 것 같습니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_geomatics +task: kmmlu_cot_hard_geomatics +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml index d5b477911d..29a432e4f8 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml @@ -85,4 +85,5 @@ fewshot_config: 일반적으로는 사업장에서 자체적으로 실시하는 것이 일반적입니다. , 국민건강증진기금의 사용 범위에는 포함되지 않을 수 있습니다. 따라서, 정답은 (D) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_health +task: kmmlu_cot_hard_health +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_industrial_engineer.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_industrial_engineer.yaml index fc8f3dec1a..f087d221a7 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_industrial_engineer.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_industrial_engineer.yaml @@ -82,4 +82,5 @@ fewshot_config: 절삭저항의 대부분을 차지합니다. 이러한 정보를 바탕으로, 주분력이 절삭저항의 대부분을 차지하므로, 탄소강을 가공할 때 가장 큰 절삭저항을 주는 것은 주분력일 것이라고 추론할 수 있습니다. 따라서, 정답은 (D) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_industrial_engineer +task: kmmlu_cot_hard_industrial_engineer +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_information_technology.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_information_technology.yaml index aa557282f0..3d6fd3ce59 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_information_technology.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_information_technology.yaml @@ -80,4 +80,5 @@ fewshot_config: 피어스 B-E형 발진 회로에서는 컬렉터-이미터 간의 임피던스가 유도성일 때 가장 안정한 발진을 지속할 수 있습니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_information_technology +task: kmmlu_cot_hard_information_technology +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_interior_architecture_and_design.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_interior_architecture_and_design.yaml index 2b3849d79c..84a61a7009 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_interior_architecture_and_design.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_interior_architecture_and_design.yaml @@ -92,4 +92,5 @@ fewshot_config: 지칭하지 않으며, 실제 설계 및 계획 과정에서는 보통 최소값, 최대값, 또는 목표값과 같이 더 구체적이고 명확한 기준을 바탕으로 최적치수를 결정합니다. 따라서, 정답은 (C) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_interior_architecture_and_design +task: kmmlu_cot_hard_interior_architecture_and_design +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml index 7498080c53..18666a70c3 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml @@ -97,4 +97,5 @@ fewshot_config: 군사 기구로, 흥선대원군은 왕권 강화를 위해 비변사의 기능을 약화시켰습니다. (D) 통상 수교 거부 정책을 추진하였다 - 흥선대원군은 외세의 침략을 막기 위해 통상 수교 거부 정책을 추진하였습니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_korean_history +task: kmmlu_cot_hard_korean_history +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml index 0328a01030..ddbd97b67e 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml @@ -81,4 +81,5 @@ fewshot_config: 중 하나입니다. (D) 네트워크 취약성으로 발생하는 문제는 물리적 통제절차의 개선으로 해결해야 한다는 것은, 네트워크 보안 문제를 해결하기 위해 물리적인 통제 절차를 개선하는 것입니다. 이는 네트워크 보안을 강화하는 데 매우 중요한 역할을 합니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_law +task: kmmlu_cot_hard_law +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_machine_design_and_manufacturing.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_machine_design_and_manufacturing.yaml index 4c6207bb29..d1e0d88bba 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_machine_design_and_manufacturing.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_machine_design_and_manufacturing.yaml @@ -83,4 +83,5 @@ fewshot_config: 선택지는 해칭이 주된 중심선 또는 단면도의 주된 외형선에 대하여 90℃ 기울기로 그린다는 내용인데, 이는 잘못된 내용입니다. 일반적으로 해칭은 45도 기울기로 그려집니다. , 이 선택지는 해칭의 일반적인 원칙을 잘못 설명하고 있습니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_machine_design_and_manufacturing +task: kmmlu_cot_hard_machine_design_and_manufacturing +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_management.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_management.yaml index 116289043d..435d762fac 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_management.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_management.yaml @@ -76,4 +76,5 @@ fewshot_config: 각 부문별로 목표를 정하고 분산된 시스템을 구축하는 것은 물류 시스템의 효율성을 높일 수 있지만, 이는 통합적인 관리가 어려울 수 있습니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_management +task: kmmlu_cot_hard_management +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_maritime_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_maritime_engineering.yaml index e168371f2d..bb7103eb66 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_maritime_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_maritime_engineering.yaml @@ -98,4 +98,5 @@ fewshot_config: (D) 아르곤: 아르곤도 불활성 기체로, 지방질에 용해되거나 마취 효과를 나타내지 않습니다. 아르곤은 주로 산업 공정에서 보호 가스로 사용됩니다. 따라서, 정답은 (B) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_maritime_engineering +task: kmmlu_cot_hard_maritime_engineering +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml index 240d92a2f0..971a106b24 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml @@ -91,4 +91,5 @@ fewshot_config: 있으며, 상담원이 고객의 반론에 대한 자연스러운 대응력을 갖추면 고객의 불만이나 반대를 효과적으로 처리할 수 있습니다. 따라서, 정답은 (A) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_marketing +task: kmmlu_cot_hard_marketing +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_materials_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_materials_engineering.yaml index 59774a1543..4f5867e258 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_materials_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_materials_engineering.yaml @@ -84,4 +84,5 @@ fewshot_config: 구별하는 데 사용될 수 있습니다. 냉간가공은 재결성 온도 이하에서 이루어지며, 열간가공은 재결성 온도 이상에서 이루어집니다. , 냉간가공과 열간가공을 구별하는 기준은 재결성 온도라고 할 수 있습니다. 따라서, 정답은 (C) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_materials_engineering +task: kmmlu_cot_hard_materials_engineering +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml index 103bc573e7..5aa474d2a4 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml @@ -95,4 +95,5 @@ fewshot_config: + ω2019 입니다. , ω^2017 + ω^2019 = ω + 1 입니다. 주어진 식에 ω + 1을 대입하면 ω + 1 + ω + 1 + 1 + 1이 됩니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_math +task: kmmlu_cot_hard_math +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_mechanical_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_mechanical_engineering.yaml index a57d066138..8d99ba72b1 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_mechanical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_mechanical_engineering.yaml @@ -77,4 +77,5 @@ fewshot_config: 어떤 것일까요? V벨트의 단면 크기는 알파벳이 뒤로 갈수록 커집니다 즉, A형은 B형보다 작고, B형은 C형보다 작으며, 이런 식으로 D형, E형으로 진행됩니다. , 주어진 선택지 중에서 가장 단면이 큰 V벨트는 E형일 것입니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_mechanical_engineering +task: kmmlu_cot_hard_mechanical_engineering +tag: kmmlu_cot_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_nondestructive_testing.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_nondestructive_testing.yaml index c7ecea1725..656b08accb 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_nondestructive_testing.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_nondestructive_testing.yaml @@ -91,4 +91,5 @@ fewshot_config: 시험체의 두께 t를 계산하면 다음과 같습니다. t = v / (2f) = 4800 / (2 * 2 * 10^6) = 0.0012m = 1.2mm 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_nondestructive_testing +task: kmmlu_cot_hard_nondestructive_testing +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml index 1e5607a5c0..30b6082590 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml @@ -110,4 +110,5 @@ fewshot_config: 발명에 대해서는 먼저 출원한 자만이 특허를 받을 수 있다고 규정하고 있으므로, 乙은 특허를 받을 수 없습니다. , (D)는 옳은 설명입니다. 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_patent +task: kmmlu_cot_hard_patent +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_political_science_and_sociology.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_political_science_and_sociology.yaml index 50c159f947..7d8c4e56e6 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_political_science_and_sociology.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_political_science_and_sociology.yaml @@ -88,4 +88,5 @@ fewshot_config: 범위에서도 활용되는 전략입니다. 도시의 이미지를 국제적으로 홍보하고, 외국인 투자자나 관광객을 유치하는 것이 도시마케팅의 일부이기 때문입니다. 도시마케팅의 공간적 범위가 국내로만 한정되어 있다는 것은 잘못된 설명입니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_political_science_and_sociology +task: kmmlu_cot_hard_political_science_and_sociology +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml index f86d14e68f..125befe11f 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml @@ -95,4 +95,5 @@ fewshot_config: 이러한 분석을 통해 고급 상담자의 특징은 (C) 내담자에게 의도적으로 주의를 기울이고 중요한 정보를 수집하고 인식할 수 있다는 것으로 보입니다. 이는 상담자의 기본적인 역량을 넘어서서 고급 상담자가 갖추어야 할 능력으로 보입니다. 따라서, 정답은 (C) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_psychology +task: kmmlu_cot_hard_psychology +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml index 5cc5c148e8..5627770be0 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml @@ -90,4 +90,5 @@ fewshot_config: 산업안전ᆞ보건과 관련된 그 밖의 사항 , 선택지 중에서 산업안전보건위원회의 심의ᆞ의결을 거치지 않아도 되는 사항은 (B) 안전ᆞ보건과 관련된 안전장치 구입 시의 적격품 여부 확인에 관한 사항입니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_public_safety +task: kmmlu_cot_hard_public_safety +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_railway_and_automotive_engineering.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_railway_and_automotive_engineering.yaml index c81e158a08..5b8b436fbd 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_railway_and_automotive_engineering.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_railway_and_automotive_engineering.yaml @@ -85,4 +85,5 @@ fewshot_config: 위한 것입니다. (D) 기관의 과냉 및 소음방지를 위해 일정 회전수 이상 시 슬립 발생: 유체 커플링식 냉각 팬은 기관의 과냉 및 소음 방지를 위해 일정 회전수 이상 시 슬립이 발생합니다. 이는 유체 커플링의 특성 때문입니다. 따라서, 정답은 (A) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_railway_and_automotive_engineering +task: kmmlu_cot_hard_railway_and_automotive_engineering +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml index 0e7d81001b..38df431259 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml @@ -88,4 +88,5 @@ fewshot_config: 따르면 개업공인중개사는 등록한 관할구역 외의 지역에 있는 중개대상물을 중개할 수 있습니다. 이 내용은 잘못된 내용입니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_real_estate +task: kmmlu_cot_hard_real_estate +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_refrigerating_machinery.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_refrigerating_machinery.yaml index 7c3984e4b9..10624f2afa 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_refrigerating_machinery.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_refrigerating_machinery.yaml @@ -88,4 +88,5 @@ fewshot_config: = 200.15K입니다. 그러므로, W = 1kJ * (300.15K - 200.15K) / 200.15K = 0.5kJ입니다. 따라서, 정답은 (D) 입니다.' include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_refrigerating_machinery +task: kmmlu_cot_hard_refrigerating_machinery +tag: kmmlu_cot_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml index d19fb51108..64e6fb5a70 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml @@ -90,4 +90,5 @@ fewshot_config: 이는 사회복지정책의 본질적인 목표와 원칙을 반영하지 못하고 있습니다. 사회복지정책은 능력이 아닌 필요에 따라 지원을 하는 것이 원칙이며, 이를 통해 사회적 불평등을 해소하고 모든 사람이 기본적인 생활을 유지할 수 있도록 지원하는 것이 목표입니다. 따라서, 정답은 (B) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_social_welfare +task: kmmlu_cot_hard_social_welfare +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml index 937a864e21..fbf880678f 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml @@ -104,4 +104,5 @@ fewshot_config: 국가의 안전보장 목적의 수행상 긴요하다고 인정하여 수입하는 물품을 의미합니다. 이 또한 국가의 안전보장을 위해 필요한 물품이므로 면세 대상에 해당할 것으로 보입니다. 따라서, 정답은 (A) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_taxation +task: kmmlu_cot_hard_taxation +tag: kmmlu_cot_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml index ca23afc0bb..54c5aac8bf 100644 --- a/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml +++ b/lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml @@ -83,4 +83,5 @@ fewshot_config: 증가하면, 전자기파의 세기는 1/r^2배 감소합니다. , 거리가 2배가 되면, 전자기파의 세기는 1/4배가 됩니다. 그리고 전력 밀도는 전기장과 자기장의 제곱에 비례하므로, 거리가 2배가 되면 전력 밀도는 1/4배가 됩니다. 따라서, 정답은 (D) 입니다. include: _cot_kmmlu_yaml -task: kmmlu_hard_cot_telecommunications_and_wireless_technology +task: kmmlu_cot_hard_telecommunications_and_wireless_technology +tag: kmmlu_cot_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml b/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml index a0c8dfdc7e..1ecb5fbab3 100644 --- a/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml +++ b/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml @@ -1,6 +1,3 @@ -tag: - - kmmlu - - kmmlu_direct dataset_path: HAERAE-HUB/KMMLU output_type: generate_until test_split: test diff --git a/lm_eval/tasks/kmmlu/direct/_kmmlu_direct.yaml b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct.yaml new file mode 100644 index 0000000000..9763d3d4d9 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct.yaml @@ -0,0 +1,11 @@ +group: kmmlu_direct +task: + - kmmlu_direct_stem + - kmmlu_direct_other + - kmmlu_direct_applied_science + - kmmlu_direct_humss +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_applied_science.yaml b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_applied_science.yaml new file mode 100644 index 0000000000..78937b3fac --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_applied_science.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_applied_science +task: + - kmmlu_direct_applied_science_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_humss.yaml b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_humss.yaml new file mode 100644 index 0000000000..1c8e4f206c --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_humss.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_humss +task: + - kmmlu_direct_humss_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_other.yaml b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_other.yaml new file mode 100644 index 0000000000..eb5166ec76 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_other.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_other +task: + - kmmlu_direct_other_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_stem.yaml b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_stem.yaml new file mode 100644 index 0000000000..932cc1e579 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct/_kmmlu_direct_stem.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_stem +task: + - kmmlu_direct_stem_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml index d7736e8d5b..d61a84b85d 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml @@ -1,3 +1,4 @@ dataset_name: Accounting include: _direct_kmmlu_yaml task: kmmlu_direct_accounting +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml index 5bf1fa4b56..a8a2829bb5 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml @@ -1,3 +1,4 @@ dataset_name: Agricultural-Sciences include: _direct_kmmlu_yaml task: kmmlu_direct_agricultural_sciences +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml index a9a621931a..d383834ffa 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml @@ -1,3 +1,4 @@ dataset_name: Aviation-Engineering-and-Maintenance include: _direct_kmmlu_yaml task: kmmlu_direct_aviation_engineering_and_maintenance +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml index ebe1765b34..aeeb1e520f 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml @@ -1,3 +1,4 @@ dataset_name: Biology include: _direct_kmmlu_yaml task: kmmlu_direct_biology +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml index e5875bb7e8..921073d5cd 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Chemical-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_chemical_engineering +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml index edabfb67dd..afa5b4b2d6 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml @@ -1,3 +1,4 @@ dataset_name: Chemistry include: _direct_kmmlu_yaml task: kmmlu_direct_chemistry +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml index 98ed98dd2c..b8c5064b93 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Civil-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_civil_engineering +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml index c546e738d6..bac82f1f45 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml @@ -1,3 +1,4 @@ dataset_name: Computer-Science include: _direct_kmmlu_yaml task: kmmlu_direct_computer_science +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml index a0af2a16cf..8cb9ada9c2 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml @@ -1,3 +1,4 @@ dataset_name: Construction include: _direct_kmmlu_yaml task: kmmlu_direct_construction +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml index 9dfdfabc59..642a88bc14 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml @@ -1,3 +1,4 @@ dataset_name: Criminal-Law include: _direct_kmmlu_yaml task: kmmlu_direct_criminal_law +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml index 9d182903e2..dffbb3c49f 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml @@ -1,3 +1,4 @@ dataset_name: Ecology include: _direct_kmmlu_yaml task: kmmlu_direct_ecology +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml index db4d78405a..1fc5d2c3b8 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml @@ -1,3 +1,4 @@ dataset_name: Economics include: _direct_kmmlu_yaml task: kmmlu_direct_economics +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml index 74887e76f3..dc151c8744 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml @@ -1,3 +1,4 @@ dataset_name: Education include: _direct_kmmlu_yaml task: kmmlu_direct_education +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml index 3455d50715..208e7b165d 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Electrical-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_electrical_engineering +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml index b45aa3083c..0a61e3d1a3 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Electronics-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_electronics_engineering +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml index b4fb806b38..085f4246ea 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml @@ -1,3 +1,4 @@ dataset_name: Energy-Management include: _direct_kmmlu_yaml task: kmmlu_direct_energy_management +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml index 1670ff16ba..104a4b9ed9 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml @@ -1,3 +1,4 @@ dataset_name: Environmental-Science include: _direct_kmmlu_yaml task: kmmlu_direct_environmental_science +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml index aef8043aa4..561e565c7b 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml @@ -1,3 +1,4 @@ dataset_name: Fashion include: _direct_kmmlu_yaml task: kmmlu_direct_fashion +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml index f49b087fc2..3050c82aa6 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml @@ -1,3 +1,4 @@ dataset_name: Food-Processing include: _direct_kmmlu_yaml task: kmmlu_direct_food_processing +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml index 00b7021c5c..708e76d875 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Gas-Technology-and-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_gas_technology_and_engineering +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml index 5d8dc70db5..0937bcfc0f 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml @@ -1,3 +1,4 @@ dataset_name: Geomatics include: _direct_kmmlu_yaml task: kmmlu_direct_geomatics +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml index 3f0d77eb78..70ef573668 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml @@ -1,3 +1,4 @@ dataset_name: Health include: _direct_kmmlu_yaml task: kmmlu_direct_health +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml index 39ea0bcf05..1454520195 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml @@ -1,3 +1,4 @@ dataset_name: Industrial-Engineer include: _direct_kmmlu_yaml task: kmmlu_direct_industrial_engineer +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml index c42e80eda1..50fc6e91f0 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml @@ -1,3 +1,4 @@ dataset_name: Information-Technology include: _direct_kmmlu_yaml task: kmmlu_direct_information_technology +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml index 842534aa0a..638de43450 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml @@ -1,3 +1,4 @@ dataset_name: Interior-Architecture-and-Design include: _direct_kmmlu_yaml task: kmmlu_direct_interior_architecture_and_design +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml index f1aa277a70..6d6b20ba10 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml @@ -1,3 +1,4 @@ dataset_name: Korean-History include: _direct_kmmlu_yaml task: kmmlu_direct_korean_history +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml index 602f8982f6..2968585272 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml @@ -1,3 +1,4 @@ dataset_name: Law include: _direct_kmmlu_yaml task: kmmlu_direct_law +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml index bfb923c2a9..587d25d0e4 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml @@ -1,3 +1,4 @@ dataset_name: Machine-Design-and-Manufacturing include: _direct_kmmlu_yaml task: kmmlu_direct_machine_design_and_manufacturing +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml index 7352a1360b..aec441bb02 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml @@ -1,3 +1,4 @@ dataset_name: Management include: _direct_kmmlu_yaml task: kmmlu_direct_management +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml index fa0c8f319f..e7e1f12e7f 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Maritime-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_maritime_engineering +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml index c3b524d853..10dadc0084 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml @@ -1,3 +1,4 @@ dataset_name: Marketing include: _direct_kmmlu_yaml task: kmmlu_direct_marketing +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml index f04e0975a0..d04632665b 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Materials-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_materials_engineering +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml index 6c5d28af05..20d17c01db 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml @@ -1,3 +1,4 @@ dataset_name: Math include: _direct_kmmlu_yaml task: kmmlu_direct_math +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml index a253535adb..3ddb279638 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Mechanical-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_mechanical_engineering +tag: kmmlu_direct_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml index 3b8dc7e784..3e37bd1c1c 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml @@ -1,3 +1,4 @@ dataset_name: Nondestructive-Testing include: _direct_kmmlu_yaml task: kmmlu_direct_nondestructive_testing +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml index 2afff2c373..e829b99583 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml @@ -1,3 +1,4 @@ dataset_name: Patent include: _direct_kmmlu_yaml task: kmmlu_direct_patent +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml index 2209abbf05..adf6c1b7f2 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml @@ -1,3 +1,4 @@ dataset_name: Political-Science-and-Sociology include: _direct_kmmlu_yaml task: kmmlu_direct_political_science_and_sociology +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml index 140302d01f..a8ccfcbd25 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml @@ -1,3 +1,4 @@ dataset_name: Psychology include: _direct_kmmlu_yaml task: kmmlu_direct_psychology +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml index 5bb16a90d1..5926a45c96 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml @@ -1,3 +1,4 @@ dataset_name: Public-Safety include: _direct_kmmlu_yaml task: kmmlu_direct_public_safety +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml index 2a13204a23..fa92c9fb80 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: Railway-and-Automotive-Engineering include: _direct_kmmlu_yaml task: kmmlu_direct_railway_and_automotive_engineering +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml index 5a5202b65d..e8872a5303 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml @@ -1,3 +1,4 @@ dataset_name: Real-Estate include: _direct_kmmlu_yaml task: kmmlu_direct_real_estate +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml index 44f9e428bb..7378739041 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml @@ -1,3 +1,4 @@ dataset_name: Refrigerating-Machinery include: _direct_kmmlu_yaml task: kmmlu_direct_refrigerating_machinery +tag: kmmlu_direct_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml index fa13bdff6a..52f731fb37 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml @@ -1,3 +1,4 @@ dataset_name: Social-Welfare include: _direct_kmmlu_yaml task: kmmlu_direct_social_welfare +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml index 69e71d6dfa..caa0d79841 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml @@ -1,3 +1,4 @@ dataset_name: Taxation include: _direct_kmmlu_yaml task: kmmlu_direct_taxation +tag: kmmlu_direct_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml index f4d1fd05c8..8f98b1d498 100644 --- a/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml +++ b/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml @@ -1,3 +1,4 @@ dataset_name: Telecommunications-and-Wireless-Technology include: _direct_kmmlu_yaml task: kmmlu_direct_telecommunications_and_wireless_technology +tag: kmmlu_direct_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml b/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml index 3cf6359206..f5ed0fda26 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml @@ -1,6 +1,3 @@ -tag: - - kmmlu - - kmmlu_hard_direct dataset_path: HAERAE-HUB/KMMLU-HARD output_type: generate_until test_split: test diff --git a/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard.yaml b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard.yaml new file mode 100644 index 0000000000..54206cdb77 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard.yaml @@ -0,0 +1,11 @@ +group: kmmlu_direct_hard +task: + - kmmlu_direct_hard_stem + - kmmlu_direct_hard_other + - kmmlu_direct_hard_applied_science + - kmmlu_direct_hard_humss +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_applied_science.yaml b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_applied_science.yaml new file mode 100644 index 0000000000..0f70ae139d --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_applied_science.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_hard_applied_science +task: + - kmmlu_direct_hard_applied_science_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_humss.yaml b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_humss.yaml new file mode 100644 index 0000000000..b28fdd1522 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_humss.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_hard_humss +task: + - kmmlu_direct_hard_humss_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_other.yaml b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_other.yaml new file mode 100644 index 0000000000..f216caa648 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_other.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_hard_other +task: + - kmmlu_direct_hard_other_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_stem.yaml b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_stem.yaml new file mode 100644 index 0000000000..026c6b4892 --- /dev/null +++ b/lm_eval/tasks/kmmlu/direct_hard/_kmmlu_direct_hard_stem.yaml @@ -0,0 +1,8 @@ +group: kmmlu_direct_hard_stem +task: + - kmmlu_direct_hard_stem_tasks +aggregate_metric_list: + - metric: exact_match + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_accounting.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_accounting.yaml index ca805e955e..d92b933d4b 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_accounting.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_accounting.yaml @@ -1,3 +1,4 @@ dataset_name: accounting include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_accounting +task: kmmlu_direct_hard_accounting +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_agricultural_sciences.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_agricultural_sciences.yaml index 7348344468..d78427d021 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_agricultural_sciences.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_agricultural_sciences.yaml @@ -1,3 +1,4 @@ dataset_name: agricultural_sciences include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_agricultural_sciences +task: kmmlu_direct_hard_agricultural_sciences +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml index 25c91cb6e5..6713f04da2 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml @@ -1,3 +1,4 @@ dataset_name: aviation_engineering_and_maintenance include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_aviation_engineering_and_maintenance +task: kmmlu_direct_hard_aviation_engineering_and_maintenance +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml index a7bc8417b0..e98a380f92 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml @@ -1,3 +1,4 @@ dataset_name: biology include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_biology +task: kmmlu_direct_hard_biology +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemical_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemical_engineering.yaml index 063974afd2..b505e3175f 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: chemical_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_chemical_engineering +task: kmmlu_direct_hard_chemical_engineering +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml index 371db7bfbf..d805e2340f 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml @@ -1,3 +1,4 @@ dataset_name: chemistry include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_chemistry +task: kmmlu_direct_hard_chemistry +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml index ba2c23b2d1..30622d50c6 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: civil_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_civil_engineering +task: kmmlu_direct_hard_civil_engineering +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml index 2a388ff474..bc0f5a37a1 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml @@ -1,3 +1,4 @@ dataset_name: computer_science include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_computer_science +task: kmmlu_direct_hard_computer_science +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_construction.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_construction.yaml index faab391b90..e050e10675 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_construction.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_construction.yaml @@ -1,3 +1,4 @@ dataset_name: construction include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_construction +task: kmmlu_direct_hard_construction +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_criminal_law.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_criminal_law.yaml index d2679f1ecd..3072b6f0b5 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_criminal_law.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_criminal_law.yaml @@ -1,3 +1,4 @@ dataset_name: criminal_law include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_criminal_law +task: kmmlu_direct_hard_criminal_law +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml index adedf9d6e7..3129f467d2 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml @@ -1,3 +1,4 @@ dataset_name: ecology include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_ecology +task: kmmlu_direct_hard_ecology +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml index f42e5b8dad..87069840e6 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml @@ -1,3 +1,4 @@ dataset_name: economics include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_economics +task: kmmlu_direct_hard_economics +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml index 9c90432fe2..75baa1364b 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml @@ -1,3 +1,4 @@ dataset_name: education include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_education +task: kmmlu_direct_hard_education +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electrical_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electrical_engineering.yaml index 780dad2268..789cdfb81c 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electrical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electrical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: electrical_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_electrical_engineering +task: kmmlu_direct_hard_electrical_engineering +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml index e01781549f..9a1736e0b2 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: electronics_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_electronics_engineering +task: kmmlu_direct_hard_electronics_engineering +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_energy_management.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_energy_management.yaml index d4c2ca7d64..4653272e02 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_energy_management.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_energy_management.yaml @@ -1,3 +1,4 @@ dataset_name: energy_management include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_energy_management +task: kmmlu_direct_hard_energy_management +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml index de511a09f0..60c0253e0f 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml @@ -1,3 +1,4 @@ dataset_name: environmental_science include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_environmental_science +task: kmmlu_direct_hard_environmental_science +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml index 26f0617dfb..86bbb9b49c 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml @@ -1,3 +1,4 @@ dataset_name: fashion include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_fashion +task: kmmlu_direct_hard_fashion +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml index e48143d2c3..6b2817d2c0 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml @@ -1,3 +1,4 @@ dataset_name: food_processing include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_food_processing +task: kmmlu_direct_hard_food_processing +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml index eb5211ad85..c2d2f4772b 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: gas_technology_and_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_gas_technology_and_engineering +task: kmmlu_direct_hard_gas_technology_and_engineering +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml index a25f3c1a7e..9dadc72dc3 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml @@ -1,3 +1,4 @@ dataset_name: geomatics include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_geomatics +task: kmmlu_direct_hard_geomatics +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_health.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_health.yaml index 0fef809eeb..f1bf4c778c 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_health.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_health.yaml @@ -1,3 +1,4 @@ dataset_name: health include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_health +task: kmmlu_direct_hard_health +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml index d7ca26e58a..5f7b73ea5f 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml @@ -1,3 +1,4 @@ dataset_name: industrial_engineer include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_industrial_engineer +task: kmmlu_direct_hard_industrial_engineer +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_information_technology.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_information_technology.yaml index 0f8d01ec92..a1c5cf9dbf 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_information_technology.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_information_technology.yaml @@ -1,3 +1,4 @@ dataset_name: information_technology include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_information_technology +task: kmmlu_direct_hard_information_technology +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml index 3b1303810a..65a20727fc 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml @@ -1,3 +1,4 @@ dataset_name: interior_architecture_and_design include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_interior_architecture_and_design +task: kmmlu_direct_hard_interior_architecture_and_design +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_korean_history.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_korean_history.yaml index c4d595d196..c10a9f576f 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_korean_history.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_korean_history.yaml @@ -1,3 +1,4 @@ dataset_name: korean_history include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_korean_history +task: kmmlu_direct_hard_korean_history +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml index 168f034059..96e5514f25 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml @@ -1,3 +1,4 @@ dataset_name: law include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_law +task: kmmlu_direct_hard_law +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_machine_design_and_manufacturing.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_machine_design_and_manufacturing.yaml index 73665b1bc0..50dfd63b23 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_machine_design_and_manufacturing.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_machine_design_and_manufacturing.yaml @@ -1,3 +1,4 @@ dataset_name: machine_design_and_manufacturing include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_machine_design_and_manufacturing +task: kmmlu_direct_hard_machine_design_and_manufacturing +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml index 6eb945d27e..48c339d743 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml @@ -1,3 +1,4 @@ dataset_name: management include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_management +task: kmmlu_direct_hard_management +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml index 4078cf973b..937bfd27f2 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: maritime_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_maritime_engineering +task: kmmlu_direct_hard_maritime_engineering +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_marketing.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_marketing.yaml index 37d62bb1ba..1ae4088a16 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_marketing.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_marketing.yaml @@ -1,3 +1,4 @@ dataset_name: marketing include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_marketing +task: kmmlu_direct_hard_marketing +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_materials_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_materials_engineering.yaml index c1e2645c2b..432460ebf7 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_materials_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_materials_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: materials_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_materials_engineering +task: kmmlu_direct_hard_materials_engineering +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml index f5f3373a8a..53d2fca14d 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml @@ -1,3 +1,4 @@ dataset_name: math include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_math +task: kmmlu_direct_hard_math +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml index dae55511a9..1a3994ea59 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: mechanical_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_mechanical_engineering +task: kmmlu_direct_hard_mechanical_engineering +tag: kmmlu_direct_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml index 3ff9583743..909c502c02 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml @@ -1,3 +1,4 @@ dataset_name: nondestructive_testing include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_nondestructive_testing +task: kmmlu_direct_hard_nondestructive_testing +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml index d913752b0b..d8faf97237 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml @@ -1,3 +1,4 @@ dataset_name: patent include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_patent +task: kmmlu_direct_hard_patent +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml index 8a5d96b600..0b65050746 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml @@ -1,3 +1,4 @@ dataset_name: political_science_and_sociology include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_political_science_and_sociology +task: kmmlu_direct_hard_political_science_and_sociology +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml index 9fbf0d3191..b1a6f7777f 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml @@ -1,3 +1,4 @@ dataset_name: psychology include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_psychology +task: kmmlu_direct_hard_psychology +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml index b376c4ebae..3da462946a 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml @@ -1,3 +1,4 @@ dataset_name: public_safety include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_public_safety +task: kmmlu_direct_hard_public_safety +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml index 0eb534e579..74e5e02f43 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: railway_and_automotive_engineering include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_railway_and_automotive_engineering +task: kmmlu_direct_hard_railway_and_automotive_engineering +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml index 9c3df599ee..8f23fae524 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml @@ -1,3 +1,4 @@ dataset_name: real_estate include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_real_estate +task: kmmlu_direct_hard_real_estate +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml index f62e8e9559..192a1f2c0d 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml @@ -1,3 +1,4 @@ dataset_name: refrigerating_machinery include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_refrigerating_machinery +task: kmmlu_direct_hard_refrigerating_machinery +tag: kmmlu_direct_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml index ad4dc2cf37..c24babc33a 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml @@ -1,3 +1,4 @@ dataset_name: social_welfare include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_social_welfare +task: kmmlu_direct_hard_social_welfare +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml index 445ab693d6..17586af6d6 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml @@ -1,3 +1,4 @@ dataset_name: taxation include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_taxation +task: kmmlu_direct_hard_taxation +tag: kmmlu_direct_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml index 498b2fb2d6..bed0df91c9 100644 --- a/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml +++ b/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml @@ -1,3 +1,4 @@ dataset_name: telecommunications_and_wireless_technology include: _direct_hard_kmmlu_yaml -task: kmmlu_hard_direct_telecommunications_and_wireless_technology +task: kmmlu_direct_hard_telecommunications_and_wireless_technology +tag: kmmlu_direct_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml b/lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml index 26c4105b31..b3e6970527 100644 --- a/lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml +++ b/lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml @@ -1,6 +1,3 @@ -tag: - - kmmlu - - kmmlu_hard dataset_path: HAERAE-HUB/KMMLU-HARD output_type: multiple_choice test_split: test @@ -12,8 +9,5 @@ metric_list: - metric: acc aggregation: mean higher_is_better: true - - metric: acc_norm - aggregation: mean - higher_is_better: true metadata: version: 2.0 diff --git a/lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml new file mode 100644 index 0000000000..827e74ec10 --- /dev/null +++ b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml @@ -0,0 +1,11 @@ +group: kmmlu_hard +task: + - kmmlu_hard_stem + - kmmlu_hard_other + - kmmlu_hard_applied_science + - kmmlu_hard_humss +aggregate_metric_list: + - metric: acc + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml new file mode 100644 index 0000000000..76d383af04 --- /dev/null +++ b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml @@ -0,0 +1,8 @@ +group: kmmlu_hard_applied_science +task: + - kmmlu_hard_applied_science_tasks +aggregate_metric_list: + - metric: acc + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml new file mode 100644 index 0000000000..39eb5a7a26 --- /dev/null +++ b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml @@ -0,0 +1,8 @@ +group: kmmlu_hard_humss +task: + - kmmlu_hard_humss_tasks +aggregate_metric_list: + - metric: acc + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml new file mode 100644 index 0000000000..5759fe8844 --- /dev/null +++ b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml @@ -0,0 +1,8 @@ +group: kmmlu_hard_other +task: + - kmmlu_hard_other_tasks +aggregate_metric_list: + - metric: acc + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml new file mode 100644 index 0000000000..ee14c72641 --- /dev/null +++ b/lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml @@ -0,0 +1,8 @@ +group: kmmlu_hard_stem +task: + - kmmlu_hard_stem_tasks +aggregate_metric_list: + - metric: acc + weight_by_size: True +metadata: + version: 2.0 diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml index 8112903b53..0c341baac0 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml @@ -1,3 +1,4 @@ dataset_name: accounting include: _hard_kmmlu_yaml task: kmmlu_hard_accounting +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml index 3a20948b62..90d284c8f7 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml @@ -1,3 +1,4 @@ dataset_name: agricultural_sciences include: _hard_kmmlu_yaml task: kmmlu_hard_agricultural_sciences +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_aviation_engineering_and_maintenance.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_aviation_engineering_and_maintenance.yaml index 87b3845f28..5ec90f362f 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_aviation_engineering_and_maintenance.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_aviation_engineering_and_maintenance.yaml @@ -1,3 +1,4 @@ dataset_name: aviation_engineering_and_maintenance include: _hard_kmmlu_yaml task: kmmlu_hard_aviation_engineering_and_maintenance +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml index 0a28b7c7ca..045e17e780 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml @@ -1,3 +1,4 @@ dataset_name: biology include: _hard_kmmlu_yaml task: kmmlu_hard_biology +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemical_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemical_engineering.yaml index 8fc448a81a..cbfa42eb20 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: chemical_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_chemical_engineering +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml index 366c95026d..67c65d6598 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml @@ -1,3 +1,4 @@ dataset_name: chemistry include: _hard_kmmlu_yaml task: kmmlu_hard_chemistry +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_civil_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_civil_engineering.yaml index ba1a15ad8c..58e3c87a84 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_civil_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_civil_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: civil_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_civil_engineering +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml index 4e1f121352..42f9146767 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml @@ -1,3 +1,4 @@ dataset_name: computer_science include: _hard_kmmlu_yaml task: kmmlu_hard_computer_science +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml index 8331379cf2..55a5a1d0d9 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml @@ -1,3 +1,4 @@ dataset_name: construction include: _hard_kmmlu_yaml task: kmmlu_hard_construction +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml index b7acd49a06..14e4d5ad65 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml @@ -1,3 +1,4 @@ dataset_name: criminal_law include: _hard_kmmlu_yaml task: kmmlu_hard_criminal_law +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml index 6542c1eef9..c737b1abaf 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml @@ -1,3 +1,4 @@ dataset_name: ecology include: _hard_kmmlu_yaml task: kmmlu_hard_ecology +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml index 4f1bfba065..9a0084dc38 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml @@ -1,3 +1,4 @@ dataset_name: economics include: _hard_kmmlu_yaml task: kmmlu_hard_economics +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml index 0f6a6a8078..568d094d67 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml @@ -1,3 +1,4 @@ dataset_name: education include: _hard_kmmlu_yaml task: kmmlu_hard_education +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electrical_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electrical_engineering.yaml index 51625c1ec3..ad46c48657 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electrical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electrical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: electrical_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_electrical_engineering +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electronics_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electronics_engineering.yaml index 252ecc19d5..843c92a056 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electronics_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_electronics_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: electronics_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_electronics_engineering +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_energy_management.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_energy_management.yaml index 062204f1de..dcfe7f36c1 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_energy_management.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_energy_management.yaml @@ -1,3 +1,4 @@ dataset_name: energy_management include: _hard_kmmlu_yaml task: kmmlu_hard_energy_management +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_environmental_science.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_environmental_science.yaml index d7f32dc5b5..a0ae1b8191 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_environmental_science.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_environmental_science.yaml @@ -1,3 +1,4 @@ dataset_name: environmental_science include: _hard_kmmlu_yaml task: kmmlu_hard_environmental_science +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml index 9448efcf8c..3ba973ba6a 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml @@ -1,3 +1,4 @@ dataset_name: fashion include: _hard_kmmlu_yaml task: kmmlu_hard_fashion +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml index 138920efbc..cd08fe3b99 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml @@ -1,3 +1,4 @@ dataset_name: food_processing include: _hard_kmmlu_yaml task: kmmlu_hard_food_processing +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_gas_technology_and_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_gas_technology_and_engineering.yaml index 14e213b583..fe30680ae6 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_gas_technology_and_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_gas_technology_and_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: gas_technology_and_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_gas_technology_and_engineering +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml index 0370a7a755..53b52e96ed 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml @@ -1,3 +1,4 @@ dataset_name: geomatics include: _hard_kmmlu_yaml task: kmmlu_hard_geomatics +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml index c5e2ba98ad..dcd2b179d6 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml @@ -1,3 +1,4 @@ dataset_name: health include: _hard_kmmlu_yaml task: kmmlu_hard_health +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_industrial_engineer.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_industrial_engineer.yaml index d3cbef78bf..2e8449ffd7 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_industrial_engineer.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_industrial_engineer.yaml @@ -1,3 +1,4 @@ dataset_name: industrial_engineer include: _hard_kmmlu_yaml task: kmmlu_hard_industrial_engineer +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_information_technology.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_information_technology.yaml index 4af23d3030..86ded35de1 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_information_technology.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_information_technology.yaml @@ -1,3 +1,4 @@ dataset_name: information_technology include: _hard_kmmlu_yaml task: kmmlu_hard_information_technology +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_interior_architecture_and_design.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_interior_architecture_and_design.yaml index 76bfe50c34..55de26414f 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_interior_architecture_and_design.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_interior_architecture_and_design.yaml @@ -1,3 +1,4 @@ dataset_name: interior_architecture_and_design include: _hard_kmmlu_yaml task: kmmlu_hard_interior_architecture_and_design +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml index 60ff94e7ff..4d4152b794 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml @@ -1,3 +1,4 @@ dataset_name: korean_history include: _hard_kmmlu_yaml task: kmmlu_hard_korean_history +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml index aeec24dcd3..0a75d9041c 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml @@ -1,3 +1,4 @@ dataset_name: law include: _hard_kmmlu_yaml task: kmmlu_hard_law +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_machine_design_and_manufacturing.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_machine_design_and_manufacturing.yaml index 222f89bacd..210ffd8feb 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_machine_design_and_manufacturing.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_machine_design_and_manufacturing.yaml @@ -1,3 +1,4 @@ dataset_name: machine_design_and_manufacturing include: _hard_kmmlu_yaml task: kmmlu_hard_machine_design_and_manufacturing +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml index 8e9e866499..d3f27519e2 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml @@ -1,3 +1,4 @@ dataset_name: management include: _hard_kmmlu_yaml task: kmmlu_hard_management +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_maritime_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_maritime_engineering.yaml index e68041d509..dec43bc804 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_maritime_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_maritime_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: maritime_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_maritime_engineering +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml index 54a62d6272..f86cfe17bc 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml @@ -1,3 +1,4 @@ dataset_name: marketing include: _hard_kmmlu_yaml task: kmmlu_hard_marketing +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_materials_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_materials_engineering.yaml index 4582b0f3b4..684120a077 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_materials_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_materials_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: materials_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_materials_engineering +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml index e563717686..ed125f90bf 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml @@ -1,3 +1,4 @@ dataset_name: math include: _hard_kmmlu_yaml task: kmmlu_hard_math +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml index 9b3adca0b6..b6d00e2e2b 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: mechanical_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_mechanical_engineering +tag: kmmlu_hard_stem_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml index 21c25fc87b..acf3ed9fd9 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml @@ -1,3 +1,4 @@ dataset_name: nondestructive_testing include: _hard_kmmlu_yaml task: kmmlu_hard_nondestructive_testing +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml index 3fcdcd96b1..910f11c54c 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml @@ -1,3 +1,4 @@ dataset_name: patent include: _hard_kmmlu_yaml task: kmmlu_hard_patent +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_political_science_and_sociology.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_political_science_and_sociology.yaml index 6bb907cb10..7b7addfdb3 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_political_science_and_sociology.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_political_science_and_sociology.yaml @@ -1,3 +1,4 @@ dataset_name: political_science_and_sociology include: _hard_kmmlu_yaml task: kmmlu_hard_political_science_and_sociology +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml index c79cef1f1c..a6d8b754e2 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml @@ -1,3 +1,4 @@ dataset_name: psychology include: _hard_kmmlu_yaml task: kmmlu_hard_psychology +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml index 110bd147e7..8b04b78e59 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml @@ -1,3 +1,4 @@ dataset_name: public_safety include: _hard_kmmlu_yaml task: kmmlu_hard_public_safety +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_railway_and_automotive_engineering.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_railway_and_automotive_engineering.yaml index 31b610f75e..358b7e36ab 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_railway_and_automotive_engineering.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_railway_and_automotive_engineering.yaml @@ -1,3 +1,4 @@ dataset_name: railway_and_automotive_engineering include: _hard_kmmlu_yaml task: kmmlu_hard_railway_and_automotive_engineering +tag: kmmlu_hard_applied_science_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml index bd1b32c858..9010e2a746 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml @@ -1,3 +1,4 @@ dataset_name: real_estate include: _hard_kmmlu_yaml task: kmmlu_hard_real_estate +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_refrigerating_machinery.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_refrigerating_machinery.yaml index 8c7dd13999..5f03b70ba2 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_refrigerating_machinery.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_refrigerating_machinery.yaml @@ -1,3 +1,4 @@ dataset_name: refrigerating_machinery include: _hard_kmmlu_yaml task: kmmlu_hard_refrigerating_machinery +tag: kmmlu_hard_other_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml index 12502a573e..24f105e467 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml @@ -1,3 +1,4 @@ dataset_name: social_welfare include: _hard_kmmlu_yaml task: kmmlu_hard_social_welfare +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml index f0f815abe4..7d0bbf86c5 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml @@ -1,3 +1,4 @@ dataset_name: taxation include: _hard_kmmlu_yaml task: kmmlu_hard_taxation +tag: kmmlu_hard_humss_tasks diff --git a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_telecommunications_and_wireless_technology.yaml b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_telecommunications_and_wireless_technology.yaml index 0cb519d11e..c1398c5f32 100644 --- a/lm_eval/tasks/kmmlu/hard/kmmlu_hard_telecommunications_and_wireless_technology.yaml +++ b/lm_eval/tasks/kmmlu/hard/kmmlu_hard_telecommunications_and_wireless_technology.yaml @@ -1,3 +1,4 @@ dataset_name: telecommunications_and_wireless_technology include: _hard_kmmlu_yaml task: kmmlu_hard_telecommunications_and_wireless_technology +tag: kmmlu_hard_applied_science_tasks