From 36021fcac74c7e8556e0e91060f730f6ce25c678 Mon Sep 17 00:00:00 2001
From: 939051420 <939051420@qq.com>
Date: Wed, 4 Sep 2024 23:04:47 +0800
Subject: [PATCH] merge, add drop and xsum, improve output, fix inference format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                    | 101 +++++---
 config.json                                  |   8 +
 config_debug.json                            |  26 +-
 lm_cute_eval/get_multiround_prompt.py        |   3 +
 lm_cute_eval/model.py                        |  24 +-
 lm_cute_eval/run.py                          |  48 ++--
 lm_cute_eval/run_score.py                    | 241 ------------------
 lm_cute_eval/tasks/drop/config_drop.json     |   1 +
 lm_cute_eval/tasks/drop/load_data_drop.py    |  23 +-
 lm_cute_eval/tasks/drop/match_answer_drop.py |  67 +++--
 lm_cute_eval/tasks/gsm8k/config_gsm8k.json   |   1 +
 lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py  |  10 +-
 .../tasks/gsm8k/match_answer_gsm8k.py        |  17 +-
 .../tasks/hellaswag/config_hellaswag.json    |   4 +-
 .../tasks/hellaswag/load_data_hellaswag.py   |  24 +-
 .../tasks/humaneval/config_humaneval.json    |   1 +
 .../tasks/humaneval/load_data_humaneval.py   |  13 +-
 .../tasks/icleval/match_answer_icleval.py    |   2 +
 lm_cute_eval/tasks/mmlu/match_answer_mmlu.py |  26 +-
 lm_cute_eval/tasks/xsum/config_xsum.json     |   1 +
 lm_cute_eval/tasks/xsum/load_data_xsum.py    |  22 +-
 lm_cute_eval/tasks/xsum/match_answer_xsum.py |  89 ++-----
 run.sh                                       |  15 +-
 23 files changed, 237 insertions(+), 530 deletions(-)
 create mode 100644 lm_cute_eval/get_multiround_prompt.py
 delete mode 100644 lm_cute_eval/run_score.py

diff --git a/README.md b/README.md
index 11430fd..e62d69c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # lm-cute-eval: a lightweight evaluation framework for large language models
 
-This is a lightweight evaluation framework for large language models. It currently supports a small set of common benchmarks. Its strength is that the task modules are decoupled from one another and easy to extend, so new tasks can be added with little effort. The framework uses the vllm library for inference and does not yet support the inference functions built into transformers.
+This is a lightweight evaluation framework for large language models. It currently supports a small set of common benchmarks. Its strength is that the task modules are decoupled from one another and easy to extend, so new tasks can be added with little effort. The framework uses the transformers and vllm libraries for inference.
 
 ## Getting started
 
@@ -21,69 +21,89 @@ unzip data.zip
 Edit the run.sh script. The parameters to consider are as follows:
 
 ```
+Model options
 model_path: absolute path to the model
-model_type: model type, used to control the prompt format, default default
-sampling_params: parameters used by the vllm inference engine
-tasks: names of the tasks to evaluate, separated by spaces; e.g. to evaluate mmlu and gsm8k, add --tasks gsm8k mmlu to the command
-save_name: name of the output folder
+model_type: model backend type, default vllm; currently vllm and hf are available
+format_type: controls the prompt format used for the model, default default
+
+Task options
+tasks: names of the tasks to evaluate, separated by spaces; e.g. to evaluate mmlu and gsm8k, add --tasks gsm8k mmlu to the command. Passing all evaluates every task automatically.
+config_path: path to the task config file; missing values are filled in automatically from the default config in each task's folder.
+data_path (no need to change): path to the datasets.
+
+Saving options
+output_path: output directory, default output
+save_name: name of the output folder.
 save_infer_results: save the inference results rather than only a score
-config_path: path to the task config file
-output_path: output directory, default ./output
+save_infer_texts: save human-readable input and output text to infer_result{round_idx}.txt
 no_timestamp: do not include a timestamp in the output folder name; with a timestamp, results are saved to "./output/{time}_{model_name}/"
+temp_file_path (no need to change): directory for temporary files, mainly used by the humaneval benchmark.
+
+Inference options
+rounds: number of inference rounds (used for other experiments; you control the prompts of the intermediate turns yourself, see lm_cute_eval/get_multiround_prompt.py).
+seed: random seed.
+use_cpu (normally not needed): run inference on the CPU (for debugging).
+temperature: model sampling parameter
+top_p: model sampling parameter
+top_k: model sampling parameter
+max_new_tokens: maximum number of tokens to generate, default 160. Different datasets need different limits, and the framework cannot set this per task, so a relatively large value is used.
 ```
 
-Parameters you probably do not need to set:
-```
-rounds: number of inference rounds, default 1, only for other experiments
-refine_prompt: prompt used between rounds of multi-round inference, only for other experiments
-temp_file_path: directory for temporary files, mainly used by the humaneval benchmark
-```
-
-For example, to evaluate mmlu and gsm8k:
+For example, to evaluate two models on mmlu and gsm8k:
 
 ```bash
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=3
 export TOKENIZERS_PARALLELISM=false
 
 declare -A models=(
-    ["model_name"]="model_path"
+    ["model_name1"]="model_path1"
+    ["model_name2"]="model_path2"
 )
 
 for model_name in "${!models[@]}"; do
     model_path=${models[$model_name]}
     python main.py \
         --model_path "$model_path" \
-        --model_type default \
-        --tasks gsm8k mmlu \
+        --model_type vllm \
+        --format_type default \
+        --tasks mmlu gsm8k \
         --save_name "$model_name" \
+        --save_infer_texts \
         --save_infer_results \
-        --config_path config.json \
-        --output_path output
+        --config_path "config.json" \
+        --output_path output/debug \
+        --max_new_tokens 180 \
+        --temperature 0.1 \
+        --top_p 0.2 \
+        --top_k 20 \
+
 done
 
 ```
 
 Configuring config:
 
-There is a default config.json file in the root directory; you can modify its contents as needed. Its format is:
+There is a default config.json file in the root directory, and each dataset's own config file also provides default values; you can modify the config contents as needed. Its format is:
 
 ```
 {
-  task_name_1: task_config_1,
-  task_name_2: task_config_2,
-  ...
+    task_name_1: task_config_1,
+    task_name_2: task_config_2,
+    ...
 }
 For example:
 {
-  "gsm8k": {
-    "num_fewshot": 8,
-    "limit": 0
-  }
-  "mmlu": {
-    "num_fewshot": 5,
-    "limit": null
-  }
+    "gsm8k": {
+        "num_fewshot": 8,
+        "limit": 0
+    },
+    "mmlu": {
+        "num_fewshot": 5,
+        "limit": null
+    }
 }
 ```
 
@@ -97,7 +117,7 @@ subjects: (list) names of the subtasks to evaluate, e.g. mmlu has abstr
 
 
 
-## Evaluation task details
+## Evaluation task overview
 
 ### arc
 
@@ -135,7 +155,7 @@ flexible_match: match any number in the response; correct if any of them equals the reference answer.
 
 ### icleval
 
-Dataset source: the original data ([ICLEval/data/tasks_data](https://github.com/yiye3/ICLEval/tree/main/data/tasks_data)), with errors in some files fixed; the copy_dict_search_string.json and copy_natural_language_string.json subtasks are not supported.
+Dataset source: [ICLEval/data/tasks_data](https://github.com/yiye3/ICLEval/tree/main/data/tasks_data), with the misspelling of "examples" as "exmaples" in some files corrected.
 
 Evaluation metric: correct if the reference answer is a substring of the text generated by the model.
 
@@ -151,3 +171,16 @@ flexible_match: match any number in the response; correct if any of them equals the reference answer.
 
 Evaluation metric: match the first option letter in the response and check whether it is correct.
 
+### rgb
+
+Dataset source: [chen700564/RGB](https://github.com/chen700564/RGB)
+
+Evaluation metric: a response is correct if any of the possible answers appears in it.
+
+### xsum
+
+Dataset source: [EdinburghNLP/XSum](https://github.com/EdinburghNLP/XSum)
+
+Evaluation metric: compute the similarity between the response and the reference summary with the BAAI/bge-m3 model.
+
+Note: because its dependencies are complex, this feature is not fully integrated yet and is entirely commented out; it works if you set up the environment yourself.
\ No newline at end of file
diff --git a/config.json b/config.json
index 9049ef5..27ff4a9 100644
--- a/config.json
+++ b/config.json
@@ -1,4 +1,12 @@
 {
+    "arc": {
+        "arc_e": {
+            "num_fewshot": 0
+        },
+        "arc_c": {
+            "num_fewshot": 25
+        }
+    },
     "commonsenseqa": {
         "num_fewshot": 7
     },
diff --git a/config_debug.json b/config_debug.json
index 4926485..ee81446 100644
--- a/config_debug.json
+++ b/config_debug.json
@@ -2,42 +2,48 @@
     "arc": {
         "arc_e": {
             "num_fewshot": 0,
-            "limit": 10
+            "limit": 1
         },
         "arc_c": {
             "num_fewshot": 25,
-            "limit": 10
+            "limit": 1
         }
     },
     "commonsenseqa": {
         "num_fewshot": 7,
-        "limit": 10
+        "limit": 1
     },
     "drop": {
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
    },
     "gsm8k": {
         "num_fewshot": 8,
-        "limit": 10
+        "limit": 1
     },
     "hellaswag": {
         "num_fewshot": 4,
-        "limit": 10
+        "limit": 1
     },
     "humaneval": {
-        "limit": 10
+        "limit": 1
+    },
+    "icleval": {
+        "limit": 1
     },
     "mmlu": {
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
+    },
+    "rgb": {
+        "limit": 1
     },
     "winogrande": {
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
     },
     "xsum":{
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
     }
 }
\ No newline at end of file
diff --git a/lm_cute_eval/get_multiround_prompt.py b/lm_cute_eval/get_multiround_prompt.py
new file mode 100644
index 0000000..8941fef
--- /dev/null
+++ b/lm_cute_eval/get_multiround_prompt.py
@@ -0,0 +1,3 @@
+def get_multiround_prompt(round_idx, args):
+    refine_prompt = "Please further think about and give me a more precise and professional answer.\n"
+    return refine_prompt
\ No newline at end of file
diff --git a/lm_cute_eval/model.py b/lm_cute_eval/model.py
index 55d40bf..b661113 100644
--- a/lm_cute_eval/model.py
+++ b/lm_cute_eval/model.py
@@ -15,8 +15,8 @@ def __init__(self, args) -> None:
         sampling_kwargs = {
"top_p": args.top_p, "top_k": args.top_k, - "max_tokens": args.max_new_tokens, "temperature": args.temperature, + "max_tokens": args.max_new_tokens, "stop": [ "Question:", "", @@ -29,12 +29,7 @@ def __init__(self, args) -> None: "Input" ] } - if args.top_p: - sampling_kwargs.update({"top_p": args.top_p}) - if args.temperature: - sampling_kwargs.update({"temperature": args.temperature}) - if args.top_k: - sampling_kwargs.update({"top_K": args.top_k}) + sampling_kwargs = {k: v for k, v in sampling_kwargs.items() if v is not None} self.sampling_params = SamplingParams(**sampling_kwargs) def generate(self, prompts): @@ -53,15 +48,12 @@ def __init__(self, args) -> None: self.generate_kwargs = { "max_new_tokens": args.max_new_tokens, "pad_token_id": self.tokenizer.pad_token_id, + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + "do_sample": True, } - - if args.temperature or args.top_p or args.top_k: - self.generate_kwargs.update({ - "temperature": args.temperature, - "top_p": args.top_p, - "top_k": args.top_k, - "do_sample": True, - }) + self.generate_kwargs = {k: v for k, v in self.generate_kwargs.items() if v is not None} self.model = AutoModelForCausalLM.from_pretrained(args.model_path).to(self.device) def generate(self, prompts): @@ -73,7 +65,7 @@ def generate(self, prompts): **self.generate_kwargs ) output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - generated_texts.append(output.strip()) + generated_texts.append(output[len(prompt):].strip()) return generated_texts diff --git a/lm_cute_eval/run.py b/lm_cute_eval/run.py index 20b8cd5..a405359 100644 --- a/lm_cute_eval/run.py +++ b/lm_cute_eval/run.py @@ -5,7 +5,9 @@ from collections import defaultdict from tqdm import tqdm import os +import torch +from .get_multiround_prompt import get_multiround_prompt from .model import initialize_model from .utils import TASK_LIST, MODEL_FORMAT, LOAD_TASK_DATA, MATCH_TASK_ANSWER @@ -103,6 +105,7 @@ def run_infer(tasks_data:dict, model, args): for item in tasks_data[task][subject]: if round_idx == 1: prompt = item["instruction"] + item["fewshot_prompt"] + item["prompt_round1"] + prompt = MODEL_FORMAT[args.format_type](prompt, history=[]) else: history = [] for i in range(1, round_idx): @@ -138,43 +141,19 @@ def run_infer(tasks_data:dict, model, args): for task in tasks_data: for subject in tasks_data[task]: for item in tasks_data[task][subject]: - item[f"prompt_round{round_idx + 1}"] = args.refine_prompt + item[f"prompt_round{round_idx + 1}"] = get_multiround_prompt(round_idx + 1, args) return infer_result -# from FlagEmbedding import BGEM3FlagModel + def run_eval(infer_results, args): result = defaultdict(dict) for round_idx in range(1, args.rounds + 1): result[f"round{round_idx}"] = {} - if "xsum" in args.tasks: - torch.cuda.empty_cache() - # model = BGEM3FlagModel('/data1/dcy/downloads/model/BAAI/bge-m3', use_fp16=True) - # args.model = model for task in args.tasks: - if task == "xsum": - continue result[f"round{round_idx}"][task] = MATCH_TASK_ANSWER[task](infer_results[task], round_idx, args) return result -def save_result_inference(infer_result:dict, args): - """ - infer_result: dict[task(str), dict[subject(str), item(dict)]] - score: dict[round{i}(str), dict[task(str), dict[subject(str), item(dict)]]] - """ - # save infer results in file - if args.save_infer_results: - infer_result_path = os.path.join(args.save_path, "infer_results_withoutscore") - os.makedirs(infer_result_path, exist_ok=True) - for 
task in infer_result: - task_path = os.path.join(infer_result_path, task) - os.makedirs(task_path, exist_ok=True) - for subject in infer_result[task]: - subject_filename = os.path.join(task_path, f"{subject}.json") - with open(subject_filename, "w") as f: - json.dump(infer_result[task][subject], f, ensure_ascii=False, indent=4) - - def save_result(infer_result:dict, score:dict, args): """ @@ -195,8 +174,10 @@ def save_result(infer_result:dict, score:dict, args): # save evaluation result summary_score = {} + summary_score_with_subjects = {} for task in args.tasks: - for subject in score["round1"][task].keys(): + task_result_with_subjects = {} + for subject in infer_result[task]: subject_result_path = os.path.join(args.save_path, "eval_result", task) subject_result = {} for round_idx in range(1, args.rounds + 1): @@ -205,14 +186,23 @@ def save_result(infer_result:dict, score:dict, args): fn = os.path.join(subject_result_path, f"{subject}.json") with open(fn, "w") as f: json.dump(subject_result, f, indent=4) + + if args.rounds == 1: + task_result_with_subjects[subject] = subject_result["round1"] + else: + task_result_with_subjects[subject] = {f"round{round_idx}": subject_result[f"round{round_idx}"] for round_idx in range(1, args.rounds + 1)} + if args.rounds == 1: task_result = score[f"round1"][task][task] else: task_result = {f"round{round_idx}": score[f"round{round_idx}"][task][task] for round_idx in range(1, args.rounds + 1)} summary_score[task] = task_result + summary_score_with_subjects[task] = task_result_with_subjects with open(os.path.join(args.save_path, "summary.json"), "w") as f: json.dump(summary_score, f, indent=4) + with open(os.path.join(args.save_path, "summary_of_subjects.json"), "w") as f: + json.dump(summary_score_with_subjects, f, indent=4) print(json.dumps(summary_score, indent=4)) @@ -239,13 +229,11 @@ def get_args(): # generate config parser.add_argument("--rounds", type=int, default=1) parser.add_argument("--seed", type=int, default=123456) - parser.add_argument("--sampling_params", type=str, default=None) - parser.add_argument("--refine_prompt", type=str, default="Please further think about and give me a more precise and professional answer.\n") parser.add_argument("--use_cpu", action="store_true") parser.add_argument("--temperature", type=float, default=None) parser.add_argument("--top_p", type=float, default=None) parser.add_argument("--top_k", type=int, default=None) - parser.add_argument("--max_new_tokens", type=int, default=128) + parser.add_argument("--max_new_tokens", type=int, default=160) args = parser.parse_args() return args diff --git a/lm_cute_eval/run_score.py b/lm_cute_eval/run_score.py deleted file mode 100644 index f86f273..0000000 --- a/lm_cute_eval/run_score.py +++ /dev/null @@ -1,241 +0,0 @@ -import datetime -import json -import argparse -import os -from collections import defaultdict -import torch -from tqdm import tqdm -from vllm import LLM, SamplingParams -import os -import json -from argparse import Namespace -from FlagEmbedding import BGEM3FlagModel -from .model import init_vllm_model -from .utils import TASK_LIST, MODEL_FORMAT, LOAD_TASK_DATA, MATCH_TASK_ANSWER,TASKS_SUBJECTS - - - -def load_args_from_config(config_path: str): - """ - 从配置文件加载参数。 - - :param config_path: 配置文件的路径 - :return: 包含参数的 Namespace 对象 - """ - # 读取配置文件 - with open(config_path, 'r') as f: - config_dict = json.load(f) - - args = argparse.Namespace() - - # 遍历字典,将值赋给Namespace对象的属性 - for key, value in config_dict.items(): - setattr(args, key, value) - - # init task config - if 
"all" in args.tasks: - args.tasks = TASK_LIST - - with open(args.config_path, "r") as f: - args.tasks_config = json.load(f) - for task in args.tasks: - try: - with open(f"code/tasks/{task}/config_{task}.json", "r") as f: - default_task_config = json.load(f) - if task not in args.tasks_config: - args.tasks_config[task] = default_task_config - for k, v in default_task_config.items(): - if k not in args.tasks_config[task]: - args.tasks_config[task][k] = v - except FileNotFoundError: - pass - if "all" in args.tasks or "mmlu"in args.tasks: - if "subjects" not in args.tasks_config["mmlu"]: - args.tasks_config["mmlu"]["subjects"] = TASKS_SUBJECTS["mmlu"] - return args - -def get_tasks_data(args): - """ - return: - tasks_data: Dict[task(str), Dict[subject(str), List[item(dict)]]] - """ - tasks_data = defaultdict(list) - for task in tqdm(args.tasks, desc="load task data"): - tasks_data[task] = LOAD_TASK_DATA[task](args) - return tasks_data - - -def run_infer(tasks_data:dict, model:LLM, sampling_params:SamplingParams, args): - """ - params: - tasks_data: Dict[task(str), Dict[subject(str), List[item(dict)]]] - - returns: - infer_result: dict[task(str), dict[subject(str), item(dict)]] - """ - infer_result = dict(tasks_data) - for round_idx in range(1, args.rounds + 1): - print(f"running infer round {round_idx}") - # get all prompts - prompts = [] - for task in tasks_data: - for subject in tasks_data[task]: - for item in tasks_data[task][subject]: - if round_idx == 1: - prompt = item["instruction"] + item["fewshot_prompt"] + item["prompt_round1"] - else: - history = [] - history.append((item[f"prompt_round{1}"], item[f"infer_round{round_idx-1}"])) - query = item[f"prompt_round{round_idx}"] - prompt = MODEL_FORMAT[args.model_type](query, history) - prompts.append(prompt) - - outputs = model.generate(prompts, sampling_params) - generated_texts = [output.outputs[0].text for output in outputs] - - if args.save_infer_results: - with open(f"{args.save_path}/infer_round{round_idx}.txt", "w") as f: - for x, y in zip(prompts, generated_texts): - print("="*20, file=f) - print(x, file=f) - print("-"*20, file=f) - print(y, file=f) - - # save infer result in this round - cur_infer_idx = 0 - for task in tasks_data: - for subject in tasks_data[task]: - for item in tasks_data[task][subject]: - item[f"infer_round{round_idx}"] = generated_texts[cur_infer_idx] - cur_infer_idx += 1 - - # prepare prompt for next round - if round_idx == args.rounds: - break - for task in tasks_data: - for subject in tasks_data[task]: - for item in tasks_data[task][subject]: - item[f"prompt_round{round_idx + 1}"] = args.refine_prompt - - return infer_result - - -def run_eval(infer_results, args): - result = defaultdict(dict) - for round_idx in range(1, args.rounds + 1): - result[f"round{round_idx}"] = {} - print(args.tasks) - if "xsum" in args.tasks: - torch.cuda.empty_cache() - model = BGEM3FlagModel('/data1/dcy/downloads/model/BAAI/bge-m3', use_fp16=True) - args.model = model - for task in args.tasks: - print(task) - result[f"round{round_idx}"][task] = MATCH_TASK_ANSWER[task](infer_results[task], round_idx, args) - return result - -def save_result_inference(infer_result:dict, args): - """ - infer_result: dict[task(str), dict[subject(str), item(dict)]] - score: dict[round{i}(str), dict[task(str), dict[subject(str), item(dict)]]] - """ - # save infer results in file - if args.save_infer_results: - infer_result_path = os.path.join(args.save_path, "infer_results_withoutscore") - os.makedirs(infer_result_path, exist_ok=True) - for task in 
infer_result: - task_path = os.path.join(infer_result_path, task) - os.makedirs(task_path, exist_ok=True) - for subject in infer_result[task]: - subject_filename = os.path.join(task_path, f"{subject}.json") - with open(subject_filename, "w") as f: - json.dump(infer_result[task][subject], f, ensure_ascii=False, indent=4) - - - -def save_result(infer_result:dict, score:dict, args): - """ - infer_result: dict[task(str), dict[subject(str), item(dict)]] - score: dict[round{i}(str), dict[task(str), dict[subject(str), item(dict)]]] - """ - # save infer results in file - print("save infer results in file") - if args.save_infer_results: - infer_result_path = os.path.join(args.save_path, "infer_results") - os.makedirs(infer_result_path, exist_ok=True) - for task in infer_result: - print("task") - task_path = os.path.join(infer_result_path, task) - os.makedirs(task_path, exist_ok=True) - for subject in infer_result[task]: - subject_filename = os.path.join(task_path, f"{subject}.json") - with open(subject_filename, "w") as f: - json.dump(infer_result[task][subject], f, ensure_ascii=False, indent=4) - - # save evaluation result - summary_score = {} - for task in tqdm(args.tasks, desc="save evaluation result"): - for subject in score["round1"][task].keys(): - subject_result_path = os.path.join(args.save_path, "eval_result", task) - subject_result = {} - for round_idx in range(1, args.rounds + 1): - subject_result[f"round{round_idx}"] = score[f"round{round_idx}"][task][subject] - os.makedirs(subject_result_path, exist_ok=True) - fn = os.path.join(subject_result_path, f"{subject}.json") - with open(fn, "w") as f: - json.dump(subject_result, f, indent=4) - if args.rounds == 1: - task_result = score[f"round1"][task][task] - else: - task_result = {f"round{round_idx}": score[f"round{round_idx}"][task][task] for round_idx in range(1, args.rounds + 1)} - summary_score[task] = task_result - - with open(os.path.join(args.save_path, "summary.json"), "w") as f: - json.dump(summary_score, f, indent=4) - print(json.dumps(summary_score, indent=4)) - -def load_inference_results(infer_result_path: str): - """ - 从指定目录加载推断结果到字典中。 - - :param infer_result_path: 包含推断结果文件的目录路径 - :return: 包含推断结果的字典 - """ - infer_result = {} - # 遍历目录中的所有文件和文件夹 - for root, dirs, files in os.walk(infer_result_path): - for file in files: - # 构建文件的完整路径 - file_path = os.path.join(root, file) - # 检查文件是否是JSON文件 - if file.endswith('.json'): - # 从文件名中提取任务和主题 - task = os.path.basename(root) - subject = os.path.splitext(file)[0] - # 打开并读取JSON文件 - with open(file_path, 'r', encoding='utf-8') as f: - item = json.load(f) - # 将读取的数据添加到字典中 - if task not in infer_result: - infer_result[task] = {} - infer_result[task][subject] = item - return infer_result - - -def main(): - load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/7-25_13:09_Llama-3_dpo_1" - # load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/5-25_02:21_llama3_gen" - # load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/5-25_02:38_llama3_gen" - # load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/5-25_02:39_llama3_gen" - config_path = os.path.join(load_path, "config.json") - result_path = os.path.join(load_path, "infer_results_withoutscore") - args = load_args_from_config(config_path) - print(args) - inference_result = load_inference_results(result_path) - score = run_eval(inference_result, args) - save_result(inference_result, score, args) - - -if __name__ == "__main__": - torch.cuda.empty_cache() - main() diff --git a/lm_cute_eval/tasks/drop/config_drop.json 
b/lm_cute_eval/tasks/drop/config_drop.json index 3841015..6dc8acc 100644 --- a/lm_cute_eval/tasks/drop/config_drop.json +++ b/lm_cute_eval/tasks/drop/config_drop.json @@ -1,4 +1,5 @@ { + "instruction": "You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.Think step by step, then write a line of the form 'Answer: $ANSWER' at the end of your response.", "num_fewshot": 0, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/drop/load_data_drop.py b/lm_cute_eval/tasks/drop/load_data_drop.py index a0996f6..a93eed2 100644 --- a/lm_cute_eval/tasks/drop/load_data_drop.py +++ b/lm_cute_eval/tasks/drop/load_data_drop.py @@ -1,29 +1,19 @@ import os, json -drop_dir = os.path.join("data", "tasks", "drop") -drop_instruction = "You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.Think step by step, then write a line of the form 'Answer: $ANSWER' at the end of your response." - def load_file_drop(fn, limit=0): data = [] with open(fn, "r", encoding='utf-8') as f: - for line in f: - try: - # 尝试解析JSON - data.append(json.loads(line)) - except json.JSONDecodeError as e: - # 如果解析失败,打印错误信息并跳过当前行 - print(f"Skipping line with JSONDecodeError: {e}") - continue # 继续读取下一行 + data.append(json.loads(line)) if limit and len(data) >= limit: break return data -def get_fewshot_cot_prompt_drop(num_fewshot): +def get_fewshot_cot_prompt_drop(drop_path, num_fewshot): assert 0 <= num_fewshot <= 8 - fewshot_cot_fn = os.path.join(drop_dir, "fewshot.txt") + fewshot_cot_fn = os.path.join(drop_path, "fewshot.txt") file_str = "" with open(fewshot_cot_fn, "r") as f: for line in f: @@ -36,15 +26,16 @@ def get_fewshot_cot_prompt_drop(num_fewshot): def load_data_drop(args): + drop_path = os.path.join(args.data_path, "tasks", "drop") task_config = args.tasks_config["drop"] - test_data = load_file_drop(os.path.join(drop_dir, "test.jsonl"), task_config["limit"]) + test_data = load_file_drop(os.path.join(drop_path, "test.jsonl"), task_config["limit"]) task_data = {"drop": []} - fewshot_prompt = get_fewshot_cot_prompt_drop(task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_cot_prompt_drop(drop_path, task_config["num_fewshot"]) for item in test_data: prompt = "Question: " + item["question"] + "\nAnswer: Let's think step by step\n" task_data["drop"].append({ **item, - "instruction": drop_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/drop/match_answer_drop.py b/lm_cute_eval/tasks/drop/match_answer_drop.py index c37349a..553d682 100644 --- a/lm_cute_eval/tasks/drop/match_answer_drop.py +++ b/lm_cute_eval/tasks/drop/match_answer_drop.py @@ -2,18 +2,10 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union import re from scipy.optimize import linear_sum_assignment -import string, numpy as np +import string +import numpy as np EXCLUDE = set(string.punctuation) - -drop_data_pattern = [ - '\s*answer is\s*([A-Za-z]+)\s*', - '\s*answer is\s*(\d+\.?\d*)', - '\s*(\d+\.?\d*)\s*', - '\s*([A-Za-z]+)\s*', - ] - - def normalize(s: str) -> str: """Lower text and remove punctuation, articles and extra whitespace.""" s = s.lower() @@ -26,36 +18,35 @@ def normalize(s: str) -> str: def match_answer_drop(infer_result, round_idx, args): - exact_match_cnt = 0 - result = {} + drop_answer_patterns = [ + '\s*answer is\s*([A-Za-z]+)\s*', + '\s*answer is\s*(\d+\.?\d*)', + '\s*(\d+\.?\d*)\s*', + 
'\s*([A-Za-z]+)\s*', + ] + correct_cnt = 0 for item in infer_result["drop"]: - answer = [] - norm_ref_answer = normalize(item["answer"]) - answer_asnwer = re.split(r' ', norm_ref_answer) - norm_ref_text = normalize(item["ref_text"]) - answer_text = re.split(r'[|]\s*|\s+', norm_ref_text) - answer.extend(answer_asnwer) - answer.extend(answer_text) + probable_answers = [] + probable_answers.extend(normalize(item["answer"]).split()) + probable_answers.extend(re.split(r'[|]\s*|\s+', normalize(item["ref_text"]))) norm_answer_item = normalize(item[f"infer_round{round_idx}"]) - for pa in drop_data_pattern: - exact_answer = re.findall(pa, norm_answer_item) - if exact_answer: - break - item[f"judge{round_idx}"] = False - if len(exact_answer) > 0: - model_answer = exact_answer[0].split(' ') - flag = 0 - for ans1 in model_answer: - for ans2 in answer: - if ans1 == ans2: - item[f"exact_match{round_idx}"] = ans2 - exact_match_cnt += 1 - item[f"judge{round_idx}"] = True - flag = 1 - break - if flag == 1: + extracted_answers = [] + for pattern in drop_answer_patterns: + extracted_answers.extend(re.findall(pattern, norm_answer_item)) + extracted_answers = list(set(extracted_answers)) + item[f"extracted_answer_round{round_idx}"] = extracted_answers + item[f"judge_round{round_idx}"] = False + for extracted_answer in extracted_answers: + for word in extracted_answer.split(): + if word in probable_answers: + correct_cnt += 1 + item[f"judge_round{round_idx}"] = True break - result["drop"] = { - "exact_match": exact_match_cnt / len(infer_result["drop"]), + if item[f"judge_round{round_idx}"]: + break + result = { + "drop": { + "acc": correct_cnt / len(infer_result["drop"]), + } } return result \ No newline at end of file diff --git a/lm_cute_eval/tasks/gsm8k/config_gsm8k.json b/lm_cute_eval/tasks/gsm8k/config_gsm8k.json index fb53af6..27fab0c 100644 --- a/lm_cute_eval/tasks/gsm8k/config_gsm8k.json +++ b/lm_cute_eval/tasks/gsm8k/config_gsm8k.json @@ -1,4 +1,5 @@ { + "instruction": "Solve the following math questions. Please think step by step and finally give the answer.\n\n", "num_fewshot": 8, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py b/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py index aac2576..5dcc374 100644 --- a/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py +++ b/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py @@ -37,18 +37,16 @@ def get_fewshot_cot_prompt_gsm8k(gsm8k_dir, num_fewshot): def load_data_gsm8k(args): - gsm8k_dir = os.path.join("data", "tasks", "gsm8k") - gsm8k_instruction = "Solve the following math questions. 
Please think step by step and finally give the answer.\n\n" - + gsm8k_path = os.path.join(args.data_path, "tasks", "gsm8k") task_config = args.tasks_config["gsm8k"] - test_data = load_file_gsm8k(os.path.join(gsm8k_dir, "test.jsonl"), task_config["limit"]) + test_data = load_file_gsm8k(os.path.join(gsm8k_path, "test.jsonl"), task_config["limit"]) task_data = {"gsm8k": []} - fewshot_prompt = get_fewshot_cot_prompt_gsm8k(gsm8k_dir, task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_cot_prompt_gsm8k(gsm8k_path, task_config["num_fewshot"]) for item in test_data: prompt = "Question: " + item["question"] + "\nAnswer: Let's think step by step\n" task_data["gsm8k"].append({ **item, - "instruction": gsm8k_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py b/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py index 9799b8f..e8b7e04 100644 --- a/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py +++ b/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py @@ -1,27 +1,24 @@ import re -""" -exact_match: Match answer after 'The answer is #### ' -flexible_match: Match every number in the response, if any number equals to the answer, the answer is correct. -""" +# exact_match: Match answer after 'The answer is #### ' +# flexible_match: Match every number in the response, if any number equals to the answer, the answer is correct. -number_pattern = r'(-?\d+(?:,\d{3})*(?:\.\d+)?)' -gsm8k_data_pattern = "#### " + number_pattern -exact_pattern = r'The answer is[:\s#\$]*\s*' + number_pattern -flexible_pattern = number_pattern def str_to_float(text: str): - """convert string like '1,234.00' to float""" + # convert string like '1,234.00' to float return float(text.replace(",", "")) def match_answer_gsm8k(infer_result, round_idx, args): + number_pattern = r'(-?\d+(?:,\d{3})*(?:\.\d+)?)' + gsm8k_data_pattern = "#### " + number_pattern + exact_pattern = r'The answer is[:\s#\$]*\s*' + number_pattern + flexible_pattern = number_pattern exact_match_cnt = 0 flexible_match_cnt = 0 result = {} for item in infer_result["gsm8k"]: answer = str_to_float(re.findall(gsm8k_data_pattern, item["answer"])[0]) - # match answer after 'The answer is #### ' exact_answer = re.findall(exact_pattern, item[f"infer_round{round_idx}"]) item[f"judge{round_idx}"] = False diff --git a/lm_cute_eval/tasks/hellaswag/config_hellaswag.json b/lm_cute_eval/tasks/hellaswag/config_hellaswag.json index 1bf0057..fc64534 100644 --- a/lm_cute_eval/tasks/hellaswag/config_hellaswag.json +++ b/lm_cute_eval/tasks/hellaswag/config_hellaswag.json @@ -1,4 +1,6 @@ { - "num_fewshot": 7, + "instruction": "Here are some multiple-choice questions about continuation writing. Each question contains a paragraph and four options for possible continuations. 
Choose the most appropriate continuation from options A, B, C, and D.\n\n\n", + "question_template": "Question: {question}\nOptions:\n(A) {A}\n(B) {B}\n(C) {C}\n(D) {D}\nAnswer: The most appropriate continuation is ", + "num_fewshot": 4, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py b/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py index 928e654..fa32495 100644 --- a/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py +++ b/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py @@ -1,9 +1,7 @@ import os, json -def format_query_hellaswag(data, has_answer): - question_template = "Question: {question}\nOptions:\n(A) {A}\n(B) {B}\n(C) {C}\n(D) {D}\nAnswer: The most appropriate continuation is " - +def format_query_hellaswag(question_template, data, has_answer): prompt = question_template.format( question=data["Q"], A=data["A"], @@ -32,32 +30,30 @@ def load_file_hellaswag(fn, limit=None): return data -def get_fewshot_prompt_hellaswag(hellaswag_dir, num_fewshot): +def get_fewshot_prompt_hellaswag(hellaswag_path, question_template, num_fewshot): assert 0 <= num_fewshot <= 25 fewshot_prompt = "" - fewshot_fn = os.path.join(hellaswag_dir, "hellaswag_train_sampled25.jsonl") + fewshot_fn = os.path.join(hellaswag_path, "hellaswag_train_sampled25.jsonl") fewshot_data = load_file_hellaswag(fewshot_fn, num_fewshot) for item in fewshot_data: - fewshot_prompt += format_query_hellaswag(item, True) - + fewshot_prompt += format_query_hellaswag(question_template, item, True) return fewshot_prompt def load_data_hellaswag(args): - hellaswag_dir = os.path.join(args.data_path, "tasks", "hellaswag") - hellaswag_instruction = "Here are some multiple-choice questions about continuation writing. Each question contains a paragraph and four options for possible continuations. 
Choose the most appropriate continuation from options A, B, C, and D.\n\n\n" - + hellaswag_path = os.path.join(args.data_path, "tasks", "hellaswag") task_config = args.tasks_config["hellaswag"] + question_template = task_config["question_template"] task_data = {} - test_fn = os.path.join(hellaswag_dir, "hellaswag.jsonl") + test_fn = os.path.join(hellaswag_path, "hellaswag.jsonl") test_data = load_file_hellaswag(test_fn, task_config["limit"]) - fewshot_prompt = get_fewshot_prompt_hellaswag(hellaswag_dir, task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_prompt_hellaswag(hellaswag_path, question_template, task_config["num_fewshot"]) data = [] for item in test_data: - prompt = format_query_hellaswag(item, False) + prompt = format_query_hellaswag(question_template, item, False) data.append({ **item, - "instruction": hellaswag_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/humaneval/config_humaneval.json b/lm_cute_eval/tasks/humaneval/config_humaneval.json index afb04f8..5320457 100644 --- a/lm_cute_eval/tasks/humaneval/config_humaneval.json +++ b/lm_cute_eval/tasks/humaneval/config_humaneval.json @@ -1,3 +1,4 @@ { + "instruction": "Please complete the following python functions and output the entire function within a python code block, without any explainations.\n\n\n", "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/humaneval/load_data_humaneval.py b/lm_cute_eval/tasks/humaneval/load_data_humaneval.py index d874c50..3a6408c 100644 --- a/lm_cute_eval/tasks/humaneval/load_data_humaneval.py +++ b/lm_cute_eval/tasks/humaneval/load_data_humaneval.py @@ -2,8 +2,8 @@ from .human_eval.data import read_problems -def get_fewshot_prompt(humaneval_dir): - fewshot_fn = os.path.join(humaneval_dir, "fewshot_prompt.txt") +def get_fewshot_prompt(humaneval_path): + fewshot_fn = os.path.join(humaneval_path, "fewshot_prompt.txt") fewshot_prompt = "" with open(fewshot_fn, "r") as f: for line in f: @@ -16,16 +16,15 @@ def format_humaneval_prompt(question:str): def load_data_humaneval(args): - humaneval_instruction = "Please complete the following python functions and output the entire function within a python code block, without any explainations.\n\n\n" - humaneval_dir = os.path.join(args.data_path, "tasks", "humaneval") + humaneval_path = os.path.join(args.data_path, "tasks", "humaneval") task_config = args.tasks_config["humaneval"] - data = read_problems(os.path.join(humaneval_dir, "HumanEval.jsonl.gz")) + data = read_problems(os.path.join(humaneval_path, "HumanEval.jsonl.gz")) task_data = {"humaneval": []} - fewshot_prompt = get_fewshot_prompt(humaneval_dir) + fewshot_prompt = get_fewshot_prompt(humaneval_path) for humaneval_id, item in data.items(): task_data["humaneval"].append({ **item, - "instruction": humaneval_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": format_humaneval_prompt(item["prompt"]) }) diff --git a/lm_cute_eval/tasks/icleval/match_answer_icleval.py b/lm_cute_eval/tasks/icleval/match_answer_icleval.py index 4b075d8..bf9192f 100644 --- a/lm_cute_eval/tasks/icleval/match_answer_icleval.py +++ b/lm_cute_eval/tasks/icleval/match_answer_icleval.py @@ -5,6 +5,7 @@ def match_answer_icleval(infer_result:dict, round_idx:int, args): subject_correct_cnt = 0 total_cnt += len(subject_result) for item in subject_result: + item[f"judge_round{round_idx}"] = False ans = str(item["label"]).strip() if subject 
== "generate_output_format": ans.replace("value", "key") @@ -13,6 +14,7 @@ def match_answer_icleval(infer_result:dict, round_idx:int, args): else: ans.replace("key", item["ans_content"]) if ans in item[f"infer_round{round_idx}"]: + item[f"judge_round{round_idx}"] = True subject_correct_cnt += 1 result[subject] = { "acc": subject_correct_cnt / len(subject_result) diff --git a/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py b/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py index 4ad2441..9b31804 100644 --- a/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py +++ b/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py @@ -1,33 +1,31 @@ import re from ..match_answer import find_first_selection -mmlu_pattern = r"The\s+answer\s+is\s+[\(\[\{]*([ABCD])[\)\]\}]*\.?" + def match_answer_mmlu(infer_result:dict, round_idx, args): + # mmlu_pattern = r"The\s+answer\s+is\s+[\(\[\{]*([ABCD])[\)\]\}]*\.?" task_config = args.tasks_config["mmlu"] result = {} for subject in task_config["subjects"]: correct_cnt = 0 - total_cnt = 0 for item in infer_result[subject]: - l = re.findall(mmlu_pattern, item[f"infer_round{round_idx}"]) - if len(l) > 0: - model_answer = l[0][0] - else: - model_answer = find_first_selection(item[f"infer_round{round_idx}"]) + # l = re.search(mmlu_pattern, item[f"infer_round{round_idx}"]) + # if len(l) > 0: + # model_answer = l[0] + # else: + # model_answer = find_first_selection(item[f"infer_round{round_idx}"]) + model_answer = find_first_selection(item[f"infer_round{round_idx}"]) item[f"extract_answer_round{round_idx}"] = model_answer - item[f"judge{round_idx}"] = False - if not item[f"extract_answer_round{round_idx}"]: - continue - total_cnt+=1 + item[f"judge_round{round_idx}"] = False if model_answer == item["ans"]: correct_cnt += 1 - item[f"judge{round_idx}"] = True + item[f"judge_round{round_idx}"] = True - subject_result = correct_cnt / total_cnt + subject_result = correct_cnt / len(infer_result[subject]) result[subject] = { "acc": subject_result, "correct_cnt": correct_cnt, - "tot_cnt": total_cnt + "tot_cnt": len(infer_result[subject]) } result["mmlu"] = { diff --git a/lm_cute_eval/tasks/xsum/config_xsum.json b/lm_cute_eval/tasks/xsum/config_xsum.json index cfa6ec4..354994a 100644 --- a/lm_cute_eval/tasks/xsum/config_xsum.json +++ b/lm_cute_eval/tasks/xsum/config_xsum.json @@ -1,4 +1,5 @@ { + "instruction": "You will be asked to read a dialog and summary the dialog. Some examples of dialogs and summaries are provided below.Think step by step, then write a summary of the form 'Answer: $ANSWER' at the end of your response.", "num_fewshot": 1, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/xsum/load_data_xsum.py b/lm_cute_eval/tasks/xsum/load_data_xsum.py index 796301a..7d5319f 100644 --- a/lm_cute_eval/tasks/xsum/load_data_xsum.py +++ b/lm_cute_eval/tasks/xsum/load_data_xsum.py @@ -1,28 +1,19 @@ import os, json -xsum_dir = os.path.join("data", "tasks", "xsum") -xsum_instruction = "You will be asked to read a dialog and summary the dialog. Some examples of dialogs and summaries are provided below.Think step by step, then write a summary of the form 'Answer: $ANSWER' at the end of your response." 
- - def load_file_xsum(fn, limit=0): data = [] with open(fn, "r", encoding='utf-8') as f: - for line in f: - try: - data.append(json.loads(line)) - except json.JSONDecodeError as e: - print(f"Skipping line with JSONDecodeError: {e}") - continue + data.append(json.loads(line)) if limit and len(data) >= limit: break return data -def get_fewshot_cot_prompt_xsum(num_fewshot): +def get_fewshot_cot_prompt_xsum(xsum_path, num_fewshot): assert 0 <= num_fewshot <= 8 - fewshot_cot_fn = os.path.join(xsum_dir, "fewshot.txt") + fewshot_cot_fn = os.path.join(xsum_path, "fewshot.txt") file_str = "" with open(fewshot_cot_fn, "r") as f: for line in f: @@ -35,15 +26,16 @@ def get_fewshot_cot_prompt_xsum(num_fewshot): def load_data_xsum(args): + xsum_path = os.path.join(args.data_path, "tasks", "xsum") task_config = args.tasks_config["xsum"] - test_data = load_file_xsum(os.path.join(xsum_dir, "test.jsonl"), task_config["limit"]) + test_data = load_file_xsum(os.path.join(xsum_path, "test.jsonl"), task_config["limit"]) task_data = {"xsum": []} - fewshot_prompt = get_fewshot_cot_prompt_xsum(task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_cot_prompt_xsum(xsum_path, task_config["num_fewshot"]) for item in test_data: prompt = "Question: " + item["question"] + "\nAnswer:" task_data["xsum"].append({ **item, - "instruction": xsum_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/xsum/match_answer_xsum.py b/lm_cute_eval/tasks/xsum/match_answer_xsum.py index 1382860..b67195c 100644 --- a/lm_cute_eval/tasks/xsum/match_answer_xsum.py +++ b/lm_cute_eval/tasks/xsum/match_answer_xsum.py @@ -1,78 +1,21 @@ - -# ''' -# # { -# # 'colbert': [0.7796499729156494, 0.4621465802192688, 0.4523794651031494, 0.7898575067520142], -# # 'sparse': [0.195556640625, 0.00879669189453125, 0.0, 0.1802978515625], -# # 'dense': [0.6259765625, 0.347412109375, 0.349853515625, 0.67822265625], -# # 'sparse+dense': [0.482503205537796, 0.23454029858112335, 0.2332356721162796, 0.5122477412223816], -# # 'colbert+sparse+dense': [0.6013619303703308, 0.3255828022956848, 0.32089319825172424, 0.6232916116714478] -# # } -# ''' # from FlagEmbedding import BGEM3FlagModel -def calc_similarity(text1:str, text2:str): - - text1 = [text1] - text2 = [text2] - text1_ids = model.encode(text1, batch_size=256, max_length=1024, )['dense_vecs'] - text2_ids = model.encode(text2, batch_size=256, max_length=1024, )['dense_vecs'] - similarity = text1_ids @ text2_ids.T - if similarity > 0.8: - return True - else: - return False - -def match_answer_xsum(infer_result, round_idx, args): - exact_match_cnt = 0 - result = {} - answer = [] - exact_answer = [] - # model = BGEM3FlagModel('/home/admin/workspace/aop_lab/app_source/dcy/download/model/BAAI/bge-m3', use_fp16=True) - # for item in infer_result["xsum"]: - # answer.append(item["answer"]) - # exact_answer.append(item[f"infer_round{round_idx}"]) - # text1_ids = model.encode(exact_answer, batch_size=256, max_length=1024, )['dense_vecs'] - # text2_ids = model.encode(answer, batch_size=256, max_length=1024, )['dense_vecs'] - # similarities = text1_ids @ text2_ids.T - # for similarity in similarities: - # exact_match_cnt+= similarity - # item[f"exact_match{round_idx}"] = similarity - - result["xsum"] = { - "similarity": (exact_match_cnt / len(infer_result["xsum"])), - } - return result -# import json -# import os -# os.environ["CUDA_VISIBLE_DEVICES"] = "6" -# if __name__ == "__main__": -# path = 
"/data1/dcy/projects/evaluate/lm-cute-eval/output/6-17_17:46_Llama-2-13b-chat-hf/infer_results/xsum/xsum.json" - -# exact_match_cnt = 0 +# def match_answer_xsum(infer_result, round_idx, args): +# similarity_sum = 0 # result = {} -# answer = [] -# exact_answer1 = [] -# exact_answer2 = [] -# with open(path, "r") as f: -# xsum_result = json.load(f) -# eval_result = xsum_result -# model = BGEM3FlagModel('/data1/dcy/downloads/model/BAAI/bge-m3', use_fp16=True) -# for item in xsum_result: -# answer.append(item["answer"]) -# exact_answer1.append(item[f"infer_round1"]) -# exact_answer2.append(item[f"infer_round2"]) -# exact_answer1_ids = model.encode(exact_answer1, batch_size=256, max_length=1024, )['dense_vecs'] -# exact_answer2_ids = model.encode(exact_answer2, batch_size=256, max_length=1024, )['dense_vecs'] -# text2_ids = model.encode(answer, batch_size=256, max_length=1024, )['dense_vecs'] -# similarities1 = exact_answer1_ids @ text2_ids.T -# similarities2 = exact_answer2_ids @ text2_ids.T -# for similarity in similarities1: -# exact_match_cnt+= similarity -# eval_result[f"exact_match1"] = similarity -# for similarity in similarities2: -# exact_match_cnt+= similarity -# eval_result[f"exact_match2"] = similarity +# similarity_model = BGEM3FlagModel("/data2/dcy/downloads/model/BAAI/bge-m3", use_fp16=True) +# for item in infer_result["xsum"]: +# label_ids = similarity_model.encode(item["answer"], batch_size=256, max_length=1024, )['dense_vecs'] +# model_answer_ids = similarity_model.encode(item[f"infer_round{round_idx}"], batch_size=256, max_length=1024, )['dense_vecs'] +# similarity = float(label_ids @ model_answer_ids.T) +# similarity_sum += similarity +# item[f"similarity_round{round_idx}"] = similarity - - \ No newline at end of file +# result["xsum"] = { +# "similarity": similarity_sum / len(infer_result["xsum"]), +# } +# return result + +def match_answer_xsum(infer_result, round_idx, args): + return {"xsum": "skipped"} \ No newline at end of file diff --git a/run.sh b/run.sh index 14d7794..1a09019 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -export CUDA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=3 export TOKENIZERS_PARALLELISM=false declare -A models=( @@ -15,12 +15,17 @@ for model_name in "${!models[@]}"; do model_path=${models[$model_name]} python main.py \ --model_path "$model_path" \ - --model_type hf \ + --model_type vllm \ --format_type default \ - --tasks drop \ + --tasks all \ --save_name "$model_name" \ --save_infer_texts \ - --config_path config_debug.json \ + --save_infer_results \ + --config_path "config_debug.json" \ --output_path output/debug \ - --use_cpu + --max_new_tokens 180 \ + --temperature 0.1 \ + --top_p 0.2 \ + --top_k 20 \ + done