From 36021fcac74c7e8556e0e91060f730f6ce25c678 Mon Sep 17 00:00:00 2001
From: 939051420 <939051420@qq.com>
Date: Wed, 4 Sep 2024 23:04:47 +0800
Subject: [PATCH] merge, add drop and xsum, improve output, fix inference format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                    | 101 +++++---
 config.json                                  |   8 +
 config_debug.json                            |  26 +-
 lm_cute_eval/get_multiround_prompt.py        |   3 +
 lm_cute_eval/model.py                        |  24 +-
 lm_cute_eval/run.py                          |  48 ++--
 lm_cute_eval/run_score.py                    | 241 ------------------
 lm_cute_eval/tasks/drop/config_drop.json     |   1 +
 lm_cute_eval/tasks/drop/load_data_drop.py    |  23 +-
 lm_cute_eval/tasks/drop/match_answer_drop.py |  67 +++--
 lm_cute_eval/tasks/gsm8k/config_gsm8k.json   |   1 +
 lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py  |  10 +-
 .../tasks/gsm8k/match_answer_gsm8k.py        |  17 +-
 .../tasks/hellaswag/config_hellaswag.json    |   4 +-
 .../tasks/hellaswag/load_data_hellaswag.py   |  24 +-
 .../tasks/humaneval/config_humaneval.json    |   1 +
 .../tasks/humaneval/load_data_humaneval.py   |  13 +-
 .../tasks/icleval/match_answer_icleval.py    |   2 +
 lm_cute_eval/tasks/mmlu/match_answer_mmlu.py |  26 +-
 lm_cute_eval/tasks/xsum/config_xsum.json     |   1 +
 lm_cute_eval/tasks/xsum/load_data_xsum.py    |  22 +-
 lm_cute_eval/tasks/xsum/match_answer_xsum.py |  89 ++-----
 run.sh                                       |  15 +-
 23 files changed, 237 insertions(+), 530 deletions(-)
 create mode 100644 lm_cute_eval/get_multiround_prompt.py
 delete mode 100644 lm_cute_eval/run_score.py

diff --git a/README.md b/README.md
index 11430fd..e62d69c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # lm-cute-eval: a lightweight evaluation framework for large language models
 
-This is a lightweight evaluation framework for large language models. It currently supports a small set of common benchmarks. Its strength is that the task modules are decoupled from one another and easy to extend, so new tasks can be added with little effort. The framework uses the vllm library for inference and does not yet support the inference functions built into transformers.
+This is a lightweight evaluation framework for large language models. It currently supports a small set of common benchmarks. Its strength is that the task modules are decoupled from one another and easy to extend, so new tasks can be added with little effort. The framework uses the transformers and vllm libraries for inference.
 
 ## Getting started
 
@@ -21,69 +21,89 @@ unzip data.zip
 Edit the run.sh script. The parameters to consider are as follows:
 
 ```
+Model options
 model_path: absolute path to the model
-model_type: model type, used to control the prompt format, default default
-sampling_params: parameters used by the vllm inference engine
-tasks: names of the tasks to evaluate, separated by spaces; e.g. to evaluate mmlu and gsm8k, add --tasks gsm8k mmlu to the command
-save_name: name of the output folder
+model_type: model backend type, default vllm; currently vllm and hf are available
+format_type: controls the prompt format used for the model, default default
+
+Task options
+tasks: names of the tasks to evaluate, separated by spaces; e.g. to evaluate mmlu and gsm8k, add --tasks gsm8k mmlu to the command. Passing all evaluates every task automatically.
+config_path: path to the task config file; missing values are filled in automatically from the default config in each task's folder.
+data_path (no need to change): path to the datasets.
+
+Saving options
+output_path: output directory, default output
+save_name: name of the output folder.
 save_infer_results: save the inference results rather than only a score
-config_path: path to the task config file
-output_path: output directory, default ./output
+save_infer_texts: save human-readable input and output text to infer_result{round_idx}.txt
 no_timestamp: do not include a timestamp in the output folder name; with a timestamp, results are saved to "./output/{time}_{model_name}/"
+temp_file_path (no need to change): directory for temporary files, mainly used by the humaneval benchmark.
+
+Inference options
+rounds: number of inference rounds (used for other experiments; you control the prompts of the intermediate turns yourself, see lm_cute_eval/get_multiround_prompt.py).
+seed: random seed.
+use_cpu (normally not needed): run inference on the CPU (for debugging).
+temperature: model sampling parameter
+top_p: model sampling parameter
+top_k: model sampling parameter
+max_new_tokens: maximum number of tokens to generate, default 160. Different datasets need different limits, and the framework cannot set this per task, so a relatively large value is used.
 ```
 
-Parameters you probably do not need to set:
-```
-rounds: number of inference rounds, default 1, only for other experiments
-refine_prompt: prompt used between rounds of multi-round inference, only for other experiments
-temp_file_path: directory for temporary files, mainly used by the humaneval benchmark
-```
-
-For example, to evaluate mmlu and gsm8k:
+For example, to evaluate two models on mmlu and gsm8k:
 
 ```bash
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=3
 export TOKENIZERS_PARALLELISM=false
 
 declare -A models=(
-    ["model_name"]="model_path"
+    ["model_name1"]="model_path1"
+    ["model_name2"]="model_path2"
 )
 
 for model_name in "${!models[@]}"; do
     model_path=${models[$model_name]}
     python main.py \
         --model_path "$model_path" \
-        --model_type default \
-        --tasks gsm8k mmlu \
+        --model_type vllm \
+        --format_type default \
+        --tasks mmlu gsm8k \
         --save_name "$model_name" \
+        --save_infer_texts \
         --save_infer_results \
-        --config_path config.json \
-        --output_path output
+        --config_path "config.json" \
+        --output_path output/debug \
+        --max_new_tokens 180 \
+        --temperature 0.1 \
+        --top_p 0.2 \
+        --top_k 20 \
+
 done
 
 ```
 
 Configuring config:
 
-There is a default config.json file in the root directory; you can modify its contents as needed. Its format is:
+There is a default config.json file in the root directory, and each dataset's own config file also provides default values; you can modify the config contents as needed. Its format is:
 
 ```
 {
-  task_name_1: task_config_1,
-  task_name_2: task_config_2,
-  ...
+    task_name_1: task_config_1,
+    task_name_2: task_config_2,
+    ...
 }
 For example:
 {
-  "gsm8k": {
-    "num_fewshot": 8,
-    "limit": 0
-  }
-  "mmlu": {
-    "num_fewshot": 5,
-    "limit": null
-  }
+    "gsm8k": {
+        "num_fewshot": 8,
+        "limit": 0
+    },
+    "mmlu": {
+        "num_fewshot": 5,
+        "limit": null
+    }
 }
 ```
 
@@ -97,7 +117,7 @@ subjects: (list) names of the subtasks to evaluate, e.g. mmlu has abstr
 
 
 
-## Evaluation task details
+## Evaluation task overview
 
 ### arc
 
@@ -135,7 +155,7 @@ flexible_match: match any number in the response; correct if any of them equals the reference answer.
 
 ### icleval
 
-Dataset source: the original data ([ICLEval/data/tasks_data](https://github.com/yiye3/ICLEval/tree/main/data/tasks_data)), with errors in some files fixed; the copy_dict_search_string.json and copy_natural_language_string.json subtasks are not supported.
+Dataset source: [ICLEval/data/tasks_data](https://github.com/yiye3/ICLEval/tree/main/data/tasks_data), with the misspelling of "examples" as "exmaples" in some files corrected.
 
 Evaluation metric: correct if the reference answer is a substring of the text generated by the model.
 
@@ -151,3 +171,16 @@ flexible_match: match any number in the response; correct if any of them equals the reference answer.
 
 Evaluation metric: match the first option letter in the response and check whether it is correct.
 
+### rgb
+
+Dataset source: [chen700564/RGB](https://github.com/chen700564/RGB)
+
+Evaluation metric: a response is correct if any of the possible answers appears in it.
+
+### xsum
+
+Dataset source: [EdinburghNLP/XSum](https://github.com/EdinburghNLP/XSum)
+
+Evaluation metric: compute the similarity between the response and the reference summary with the BAAI/bge-m3 model.
+
+Note: because its dependencies are complex, this feature is not fully integrated yet and is entirely commented out; it works if you set up the environment yourself.
\ No newline at end of file
diff --git a/config.json b/config.json
index 9049ef5..27ff4a9 100644
--- a/config.json
+++ b/config.json
@@ -1,4 +1,12 @@
 {
+    "arc": {
+        "arc_e": {
+            "num_fewshot": 0
+        },
+        "arc_c": {
+            "num_fewshot": 25
+        }
+    },
     "commonsenseqa": {
         "num_fewshot": 7
     },
diff --git a/config_debug.json b/config_debug.json
index 4926485..ee81446 100644
--- a/config_debug.json
+++ b/config_debug.json
@@ -2,42 +2,48 @@
     "arc": {
         "arc_e": {
             "num_fewshot": 0,
-            "limit": 10
+            "limit": 1
         },
         "arc_c": {
             "num_fewshot": 25,
-            "limit": 10
+            "limit": 1
         }
     },
     "commonsenseqa": {
         "num_fewshot": 7,
-        "limit": 10
+        "limit": 1
     },
     "drop": {
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
    },
     "gsm8k": {
         "num_fewshot": 8,
-        "limit": 10
+        "limit": 1
     },
     "hellaswag": {
         "num_fewshot": 4,
-        "limit": 10
+        "limit": 1
     },
     "humaneval": {
-        "limit": 10
+        "limit": 1
+    },
+    "icleval": {
+        "limit": 1
     },
     "mmlu": {
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
+    },
+    "rgb": {
+        "limit": 1
     },
     "winogrande": {
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
     },
     "xsum":{
         "num_fewshot": 5,
-        "limit": 10
+        "limit": 1
     }
 }
\ No newline at end of file
diff --git a/lm_cute_eval/get_multiround_prompt.py b/lm_cute_eval/get_multiround_prompt.py
new file mode 100644
index 0000000..8941fef
--- /dev/null
+++ b/lm_cute_eval/get_multiround_prompt.py
@@ -0,0 +1,3 @@
+def get_multiround_prompt(round_idx, args):
+    refine_prompt = "Please further think about and give me a more precise and professional answer.\n"
+    return refine_prompt
\ No newline at end of file
diff --git a/lm_cute_eval/model.py b/lm_cute_eval/model.py
index 55d40bf..b661113 100644
--- a/lm_cute_eval/model.py
+++ b/lm_cute_eval/model.py
@@ -15,8 +15,8 @@ def __init__(self, args) -> None:
         sampling_kwargs = {
"top_p": args.top_p, "top_k": args.top_k, - "max_tokens": args.max_new_tokens, "temperature": args.temperature, + "max_tokens": args.max_new_tokens, "stop": [ "Question:", "", @@ -29,12 +29,7 @@ def __init__(self, args) -> None: "Input" ] } - if args.top_p: - sampling_kwargs.update({"top_p": args.top_p}) - if args.temperature: - sampling_kwargs.update({"temperature": args.temperature}) - if args.top_k: - sampling_kwargs.update({"top_K": args.top_k}) + sampling_kwargs = {k: v for k, v in sampling_kwargs.items() if v is not None} self.sampling_params = SamplingParams(**sampling_kwargs) def generate(self, prompts): @@ -53,15 +48,12 @@ def __init__(self, args) -> None: self.generate_kwargs = { "max_new_tokens": args.max_new_tokens, "pad_token_id": self.tokenizer.pad_token_id, + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + "do_sample": True, } - - if args.temperature or args.top_p or args.top_k: - self.generate_kwargs.update({ - "temperature": args.temperature, - "top_p": args.top_p, - "top_k": args.top_k, - "do_sample": True, - }) + self.generate_kwargs = {k: v for k, v in self.generate_kwargs.items() if v is not None} self.model = AutoModelForCausalLM.from_pretrained(args.model_path).to(self.device) def generate(self, prompts): @@ -73,7 +65,7 @@ def generate(self, prompts): **self.generate_kwargs ) output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - generated_texts.append(output.strip()) + generated_texts.append(output[len(prompt):].strip()) return generated_texts diff --git a/lm_cute_eval/run.py b/lm_cute_eval/run.py index 20b8cd5..a405359 100644 --- a/lm_cute_eval/run.py +++ b/lm_cute_eval/run.py @@ -5,7 +5,9 @@ from collections import defaultdict from tqdm import tqdm import os +import torch +from .get_multiround_prompt import get_multiround_prompt from .model import initialize_model from .utils import TASK_LIST, MODEL_FORMAT, LOAD_TASK_DATA, MATCH_TASK_ANSWER @@ -103,6 +105,7 @@ def run_infer(tasks_data:dict, model, args): for item in tasks_data[task][subject]: if round_idx == 1: prompt = item["instruction"] + item["fewshot_prompt"] + item["prompt_round1"] + prompt = MODEL_FORMAT[args.format_type](prompt, history=[]) else: history = [] for i in range(1, round_idx): @@ -138,43 +141,19 @@ def run_infer(tasks_data:dict, model, args): for task in tasks_data: for subject in tasks_data[task]: for item in tasks_data[task][subject]: - item[f"prompt_round{round_idx + 1}"] = args.refine_prompt + item[f"prompt_round{round_idx + 1}"] = get_multiround_prompt(round_idx + 1, args) return infer_result -# from FlagEmbedding import BGEM3FlagModel + def run_eval(infer_results, args): result = defaultdict(dict) for round_idx in range(1, args.rounds + 1): result[f"round{round_idx}"] = {} - if "xsum" in args.tasks: - torch.cuda.empty_cache() - # model = BGEM3FlagModel('/data1/dcy/downloads/model/BAAI/bge-m3', use_fp16=True) - # args.model = model for task in args.tasks: - if task == "xsum": - continue result[f"round{round_idx}"][task] = MATCH_TASK_ANSWER[task](infer_results[task], round_idx, args) return result -def save_result_inference(infer_result:dict, args): - """ - infer_result: dict[task(str), dict[subject(str), item(dict)]] - score: dict[round{i}(str), dict[task(str), dict[subject(str), item(dict)]]] - """ - # save infer results in file - if args.save_infer_results: - infer_result_path = os.path.join(args.save_path, "infer_results_withoutscore") - os.makedirs(infer_result_path, exist_ok=True) - for 
task in infer_result: - task_path = os.path.join(infer_result_path, task) - os.makedirs(task_path, exist_ok=True) - for subject in infer_result[task]: - subject_filename = os.path.join(task_path, f"{subject}.json") - with open(subject_filename, "w") as f: - json.dump(infer_result[task][subject], f, ensure_ascii=False, indent=4) - - def save_result(infer_result:dict, score:dict, args): """ @@ -195,8 +174,10 @@ def save_result(infer_result:dict, score:dict, args): # save evaluation result summary_score = {} + summary_score_with_subjects = {} for task in args.tasks: - for subject in score["round1"][task].keys(): + task_result_with_subjects = {} + for subject in infer_result[task]: subject_result_path = os.path.join(args.save_path, "eval_result", task) subject_result = {} for round_idx in range(1, args.rounds + 1): @@ -205,14 +186,23 @@ def save_result(infer_result:dict, score:dict, args): fn = os.path.join(subject_result_path, f"{subject}.json") with open(fn, "w") as f: json.dump(subject_result, f, indent=4) + + if args.rounds == 1: + task_result_with_subjects[subject] = subject_result["round1"] + else: + task_result_with_subjects[subject] = {f"round{round_idx}": subject_result[f"round{round_idx}"] for round_idx in range(1, args.rounds + 1)} + if args.rounds == 1: task_result = score[f"round1"][task][task] else: task_result = {f"round{round_idx}": score[f"round{round_idx}"][task][task] for round_idx in range(1, args.rounds + 1)} summary_score[task] = task_result + summary_score_with_subjects[task] = task_result_with_subjects with open(os.path.join(args.save_path, "summary.json"), "w") as f: json.dump(summary_score, f, indent=4) + with open(os.path.join(args.save_path, "summary_of_subjects.json"), "w") as f: + json.dump(summary_score_with_subjects, f, indent=4) print(json.dumps(summary_score, indent=4)) @@ -239,13 +229,11 @@ def get_args(): # generate config parser.add_argument("--rounds", type=int, default=1) parser.add_argument("--seed", type=int, default=123456) - parser.add_argument("--sampling_params", type=str, default=None) - parser.add_argument("--refine_prompt", type=str, default="Please further think about and give me a more precise and professional answer.\n") parser.add_argument("--use_cpu", action="store_true") parser.add_argument("--temperature", type=float, default=None) parser.add_argument("--top_p", type=float, default=None) parser.add_argument("--top_k", type=int, default=None) - parser.add_argument("--max_new_tokens", type=int, default=128) + parser.add_argument("--max_new_tokens", type=int, default=160) args = parser.parse_args() return args diff --git a/lm_cute_eval/run_score.py b/lm_cute_eval/run_score.py deleted file mode 100644 index f86f273..0000000 --- a/lm_cute_eval/run_score.py +++ /dev/null @@ -1,241 +0,0 @@ -import datetime -import json -import argparse -import os -from collections import defaultdict -import torch -from tqdm import tqdm -from vllm import LLM, SamplingParams -import os -import json -from argparse import Namespace -from FlagEmbedding import BGEM3FlagModel -from .model import init_vllm_model -from .utils import TASK_LIST, MODEL_FORMAT, LOAD_TASK_DATA, MATCH_TASK_ANSWER,TASKS_SUBJECTS - - - -def load_args_from_config(config_path: str): - """ - 从配置文件加载参数。 - - :param config_path: 配置文件的路径 - :return: 包含参数的 Namespace 对象 - """ - # 读取配置文件 - with open(config_path, 'r') as f: - config_dict = json.load(f) - - args = argparse.Namespace() - - # 遍历字典,将值赋给Namespace对象的属性 - for key, value in config_dict.items(): - setattr(args, key, value) - - # init task config - if 
"all" in args.tasks: - args.tasks = TASK_LIST - - with open(args.config_path, "r") as f: - args.tasks_config = json.load(f) - for task in args.tasks: - try: - with open(f"code/tasks/{task}/config_{task}.json", "r") as f: - default_task_config = json.load(f) - if task not in args.tasks_config: - args.tasks_config[task] = default_task_config - for k, v in default_task_config.items(): - if k not in args.tasks_config[task]: - args.tasks_config[task][k] = v - except FileNotFoundError: - pass - if "all" in args.tasks or "mmlu"in args.tasks: - if "subjects" not in args.tasks_config["mmlu"]: - args.tasks_config["mmlu"]["subjects"] = TASKS_SUBJECTS["mmlu"] - return args - -def get_tasks_data(args): - """ - return: - tasks_data: Dict[task(str), Dict[subject(str), List[item(dict)]]] - """ - tasks_data = defaultdict(list) - for task in tqdm(args.tasks, desc="load task data"): - tasks_data[task] = LOAD_TASK_DATA[task](args) - return tasks_data - - -def run_infer(tasks_data:dict, model:LLM, sampling_params:SamplingParams, args): - """ - params: - tasks_data: Dict[task(str), Dict[subject(str), List[item(dict)]]] - - returns: - infer_result: dict[task(str), dict[subject(str), item(dict)]] - """ - infer_result = dict(tasks_data) - for round_idx in range(1, args.rounds + 1): - print(f"running infer round {round_idx}") - # get all prompts - prompts = [] - for task in tasks_data: - for subject in tasks_data[task]: - for item in tasks_data[task][subject]: - if round_idx == 1: - prompt = item["instruction"] + item["fewshot_prompt"] + item["prompt_round1"] - else: - history = [] - history.append((item[f"prompt_round{1}"], item[f"infer_round{round_idx-1}"])) - query = item[f"prompt_round{round_idx}"] - prompt = MODEL_FORMAT[args.model_type](query, history) - prompts.append(prompt) - - outputs = model.generate(prompts, sampling_params) - generated_texts = [output.outputs[0].text for output in outputs] - - if args.save_infer_results: - with open(f"{args.save_path}/infer_round{round_idx}.txt", "w") as f: - for x, y in zip(prompts, generated_texts): - print("="*20, file=f) - print(x, file=f) - print("-"*20, file=f) - print(y, file=f) - - # save infer result in this round - cur_infer_idx = 0 - for task in tasks_data: - for subject in tasks_data[task]: - for item in tasks_data[task][subject]: - item[f"infer_round{round_idx}"] = generated_texts[cur_infer_idx] - cur_infer_idx += 1 - - # prepare prompt for next round - if round_idx == args.rounds: - break - for task in tasks_data: - for subject in tasks_data[task]: - for item in tasks_data[task][subject]: - item[f"prompt_round{round_idx + 1}"] = args.refine_prompt - - return infer_result - - -def run_eval(infer_results, args): - result = defaultdict(dict) - for round_idx in range(1, args.rounds + 1): - result[f"round{round_idx}"] = {} - print(args.tasks) - if "xsum" in args.tasks: - torch.cuda.empty_cache() - model = BGEM3FlagModel('/data1/dcy/downloads/model/BAAI/bge-m3', use_fp16=True) - args.model = model - for task in args.tasks: - print(task) - result[f"round{round_idx}"][task] = MATCH_TASK_ANSWER[task](infer_results[task], round_idx, args) - return result - -def save_result_inference(infer_result:dict, args): - """ - infer_result: dict[task(str), dict[subject(str), item(dict)]] - score: dict[round{i}(str), dict[task(str), dict[subject(str), item(dict)]]] - """ - # save infer results in file - if args.save_infer_results: - infer_result_path = os.path.join(args.save_path, "infer_results_withoutscore") - os.makedirs(infer_result_path, exist_ok=True) - for task in 
infer_result: - task_path = os.path.join(infer_result_path, task) - os.makedirs(task_path, exist_ok=True) - for subject in infer_result[task]: - subject_filename = os.path.join(task_path, f"{subject}.json") - with open(subject_filename, "w") as f: - json.dump(infer_result[task][subject], f, ensure_ascii=False, indent=4) - - - -def save_result(infer_result:dict, score:dict, args): - """ - infer_result: dict[task(str), dict[subject(str), item(dict)]] - score: dict[round{i}(str), dict[task(str), dict[subject(str), item(dict)]]] - """ - # save infer results in file - print("save infer results in file") - if args.save_infer_results: - infer_result_path = os.path.join(args.save_path, "infer_results") - os.makedirs(infer_result_path, exist_ok=True) - for task in infer_result: - print("task") - task_path = os.path.join(infer_result_path, task) - os.makedirs(task_path, exist_ok=True) - for subject in infer_result[task]: - subject_filename = os.path.join(task_path, f"{subject}.json") - with open(subject_filename, "w") as f: - json.dump(infer_result[task][subject], f, ensure_ascii=False, indent=4) - - # save evaluation result - summary_score = {} - for task in tqdm(args.tasks, desc="save evaluation result"): - for subject in score["round1"][task].keys(): - subject_result_path = os.path.join(args.save_path, "eval_result", task) - subject_result = {} - for round_idx in range(1, args.rounds + 1): - subject_result[f"round{round_idx}"] = score[f"round{round_idx}"][task][subject] - os.makedirs(subject_result_path, exist_ok=True) - fn = os.path.join(subject_result_path, f"{subject}.json") - with open(fn, "w") as f: - json.dump(subject_result, f, indent=4) - if args.rounds == 1: - task_result = score[f"round1"][task][task] - else: - task_result = {f"round{round_idx}": score[f"round{round_idx}"][task][task] for round_idx in range(1, args.rounds + 1)} - summary_score[task] = task_result - - with open(os.path.join(args.save_path, "summary.json"), "w") as f: - json.dump(summary_score, f, indent=4) - print(json.dumps(summary_score, indent=4)) - -def load_inference_results(infer_result_path: str): - """ - 从指定目录加载推断结果到字典中。 - - :param infer_result_path: 包含推断结果文件的目录路径 - :return: 包含推断结果的字典 - """ - infer_result = {} - # 遍历目录中的所有文件和文件夹 - for root, dirs, files in os.walk(infer_result_path): - for file in files: - # 构建文件的完整路径 - file_path = os.path.join(root, file) - # 检查文件是否是JSON文件 - if file.endswith('.json'): - # 从文件名中提取任务和主题 - task = os.path.basename(root) - subject = os.path.splitext(file)[0] - # 打开并读取JSON文件 - with open(file_path, 'r', encoding='utf-8') as f: - item = json.load(f) - # 将读取的数据添加到字典中 - if task not in infer_result: - infer_result[task] = {} - infer_result[task][subject] = item - return infer_result - - -def main(): - load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/7-25_13:09_Llama-3_dpo_1" - # load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/5-25_02:21_llama3_gen" - # load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/5-25_02:38_llama3_gen" - # load_path = "/data1/dcy/projects/evaluate/lm-cute-eval/output/5-25_02:39_llama3_gen" - config_path = os.path.join(load_path, "config.json") - result_path = os.path.join(load_path, "infer_results_withoutscore") - args = load_args_from_config(config_path) - print(args) - inference_result = load_inference_results(result_path) - score = run_eval(inference_result, args) - save_result(inference_result, score, args) - - -if __name__ == "__main__": - torch.cuda.empty_cache() - main() diff --git a/lm_cute_eval/tasks/drop/config_drop.json 
b/lm_cute_eval/tasks/drop/config_drop.json index 3841015..6dc8acc 100644 --- a/lm_cute_eval/tasks/drop/config_drop.json +++ b/lm_cute_eval/tasks/drop/config_drop.json @@ -1,4 +1,5 @@ { + "instruction": "You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.Think step by step, then write a line of the form 'Answer: $ANSWER' at the end of your response.", "num_fewshot": 0, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/drop/load_data_drop.py b/lm_cute_eval/tasks/drop/load_data_drop.py index a0996f6..a93eed2 100644 --- a/lm_cute_eval/tasks/drop/load_data_drop.py +++ b/lm_cute_eval/tasks/drop/load_data_drop.py @@ -1,29 +1,19 @@ import os, json -drop_dir = os.path.join("data", "tasks", "drop") -drop_instruction = "You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.Think step by step, then write a line of the form 'Answer: $ANSWER' at the end of your response." - def load_file_drop(fn, limit=0): data = [] with open(fn, "r", encoding='utf-8') as f: - for line in f: - try: - # 尝试解析JSON - data.append(json.loads(line)) - except json.JSONDecodeError as e: - # 如果解析失败,打印错误信息并跳过当前行 - print(f"Skipping line with JSONDecodeError: {e}") - continue # 继续读取下一行 + data.append(json.loads(line)) if limit and len(data) >= limit: break return data -def get_fewshot_cot_prompt_drop(num_fewshot): +def get_fewshot_cot_prompt_drop(drop_path, num_fewshot): assert 0 <= num_fewshot <= 8 - fewshot_cot_fn = os.path.join(drop_dir, "fewshot.txt") + fewshot_cot_fn = os.path.join(drop_path, "fewshot.txt") file_str = "" with open(fewshot_cot_fn, "r") as f: for line in f: @@ -36,15 +26,16 @@ def get_fewshot_cot_prompt_drop(num_fewshot): def load_data_drop(args): + drop_path = os.path.join(args.data_path, "tasks", "drop") task_config = args.tasks_config["drop"] - test_data = load_file_drop(os.path.join(drop_dir, "test.jsonl"), task_config["limit"]) + test_data = load_file_drop(os.path.join(drop_path, "test.jsonl"), task_config["limit"]) task_data = {"drop": []} - fewshot_prompt = get_fewshot_cot_prompt_drop(task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_cot_prompt_drop(drop_path, task_config["num_fewshot"]) for item in test_data: prompt = "Question: " + item["question"] + "\nAnswer: Let's think step by step\n" task_data["drop"].append({ **item, - "instruction": drop_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/drop/match_answer_drop.py b/lm_cute_eval/tasks/drop/match_answer_drop.py index c37349a..553d682 100644 --- a/lm_cute_eval/tasks/drop/match_answer_drop.py +++ b/lm_cute_eval/tasks/drop/match_answer_drop.py @@ -2,18 +2,10 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union import re from scipy.optimize import linear_sum_assignment -import string, numpy as np +import string +import numpy as np EXCLUDE = set(string.punctuation) - -drop_data_pattern = [ - '\s*answer is\s*([A-Za-z]+)\s*', - '\s*answer is\s*(\d+\.?\d*)', - '\s*(\d+\.?\d*)\s*', - '\s*([A-Za-z]+)\s*', - ] - - def normalize(s: str) -> str: """Lower text and remove punctuation, articles and extra whitespace.""" s = s.lower() @@ -26,36 +18,35 @@ def normalize(s: str) -> str: def match_answer_drop(infer_result, round_idx, args): - exact_match_cnt = 0 - result = {} + drop_answer_patterns = [ + '\s*answer is\s*([A-Za-z]+)\s*', + '\s*answer is\s*(\d+\.?\d*)', + '\s*(\d+\.?\d*)\s*', + 
'\s*([A-Za-z]+)\s*', + ] + correct_cnt = 0 for item in infer_result["drop"]: - answer = [] - norm_ref_answer = normalize(item["answer"]) - answer_asnwer = re.split(r' ', norm_ref_answer) - norm_ref_text = normalize(item["ref_text"]) - answer_text = re.split(r'[|]\s*|\s+', norm_ref_text) - answer.extend(answer_asnwer) - answer.extend(answer_text) + probable_answers = [] + probable_answers.extend(normalize(item["answer"]).split()) + probable_answers.extend(re.split(r'[|]\s*|\s+', normalize(item["ref_text"]))) norm_answer_item = normalize(item[f"infer_round{round_idx}"]) - for pa in drop_data_pattern: - exact_answer = re.findall(pa, norm_answer_item) - if exact_answer: - break - item[f"judge{round_idx}"] = False - if len(exact_answer) > 0: - model_answer = exact_answer[0].split(' ') - flag = 0 - for ans1 in model_answer: - for ans2 in answer: - if ans1 == ans2: - item[f"exact_match{round_idx}"] = ans2 - exact_match_cnt += 1 - item[f"judge{round_idx}"] = True - flag = 1 - break - if flag == 1: + extracted_answers = [] + for pattern in drop_answer_patterns: + extracted_answers.extend(re.findall(pattern, norm_answer_item)) + extracted_answers = list(set(extracted_answers)) + item[f"extracted_answer_round{round_idx}"] = extracted_answers + item[f"judge_round{round_idx}"] = False + for extracted_answer in extracted_answers: + for word in extracted_answer.split(): + if word in probable_answers: + correct_cnt += 1 + item[f"judge_round{round_idx}"] = True break - result["drop"] = { - "exact_match": exact_match_cnt / len(infer_result["drop"]), + if item[f"judge_round{round_idx}"]: + break + result = { + "drop": { + "acc": correct_cnt / len(infer_result["drop"]), + } } return result \ No newline at end of file diff --git a/lm_cute_eval/tasks/gsm8k/config_gsm8k.json b/lm_cute_eval/tasks/gsm8k/config_gsm8k.json index fb53af6..27fab0c 100644 --- a/lm_cute_eval/tasks/gsm8k/config_gsm8k.json +++ b/lm_cute_eval/tasks/gsm8k/config_gsm8k.json @@ -1,4 +1,5 @@ { + "instruction": "Solve the following math questions. Please think step by step and finally give the answer.\n\n", "num_fewshot": 8, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py b/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py index aac2576..5dcc374 100644 --- a/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py +++ b/lm_cute_eval/tasks/gsm8k/load_data_gsm8k.py @@ -37,18 +37,16 @@ def get_fewshot_cot_prompt_gsm8k(gsm8k_dir, num_fewshot): def load_data_gsm8k(args): - gsm8k_dir = os.path.join("data", "tasks", "gsm8k") - gsm8k_instruction = "Solve the following math questions. 
Please think step by step and finally give the answer.\n\n" - + gsm8k_path = os.path.join(args.data_path, "tasks", "gsm8k") task_config = args.tasks_config["gsm8k"] - test_data = load_file_gsm8k(os.path.join(gsm8k_dir, "test.jsonl"), task_config["limit"]) + test_data = load_file_gsm8k(os.path.join(gsm8k_path, "test.jsonl"), task_config["limit"]) task_data = {"gsm8k": []} - fewshot_prompt = get_fewshot_cot_prompt_gsm8k(gsm8k_dir, task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_cot_prompt_gsm8k(gsm8k_path, task_config["num_fewshot"]) for item in test_data: prompt = "Question: " + item["question"] + "\nAnswer: Let's think step by step\n" task_data["gsm8k"].append({ **item, - "instruction": gsm8k_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py b/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py index 9799b8f..e8b7e04 100644 --- a/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py +++ b/lm_cute_eval/tasks/gsm8k/match_answer_gsm8k.py @@ -1,27 +1,24 @@ import re -""" -exact_match: Match answer after 'The answer is #### ' -flexible_match: Match every number in the response, if any number equals to the answer, the answer is correct. -""" +# exact_match: Match answer after 'The answer is #### ' +# flexible_match: Match every number in the response, if any number equals to the answer, the answer is correct. -number_pattern = r'(-?\d+(?:,\d{3})*(?:\.\d+)?)' -gsm8k_data_pattern = "#### " + number_pattern -exact_pattern = r'The answer is[:\s#\$]*\s*' + number_pattern -flexible_pattern = number_pattern def str_to_float(text: str): - """convert string like '1,234.00' to float""" + # convert string like '1,234.00' to float return float(text.replace(",", "")) def match_answer_gsm8k(infer_result, round_idx, args): + number_pattern = r'(-?\d+(?:,\d{3})*(?:\.\d+)?)' + gsm8k_data_pattern = "#### " + number_pattern + exact_pattern = r'The answer is[:\s#\$]*\s*' + number_pattern + flexible_pattern = number_pattern exact_match_cnt = 0 flexible_match_cnt = 0 result = {} for item in infer_result["gsm8k"]: answer = str_to_float(re.findall(gsm8k_data_pattern, item["answer"])[0]) - # match answer after 'The answer is #### ' exact_answer = re.findall(exact_pattern, item[f"infer_round{round_idx}"]) item[f"judge{round_idx}"] = False diff --git a/lm_cute_eval/tasks/hellaswag/config_hellaswag.json b/lm_cute_eval/tasks/hellaswag/config_hellaswag.json index 1bf0057..fc64534 100644 --- a/lm_cute_eval/tasks/hellaswag/config_hellaswag.json +++ b/lm_cute_eval/tasks/hellaswag/config_hellaswag.json @@ -1,4 +1,6 @@ { - "num_fewshot": 7, + "instruction": "Here are some multiple-choice questions about continuation writing. Each question contains a paragraph and four options for possible continuations. 
Choose the most appropriate continuation from options A, B, C, and D.\n\n\n", + "question_template": "Question: {question}\nOptions:\n(A) {A}\n(B) {B}\n(C) {C}\n(D) {D}\nAnswer: The most appropriate continuation is ", + "num_fewshot": 4, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py b/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py index 928e654..fa32495 100644 --- a/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py +++ b/lm_cute_eval/tasks/hellaswag/load_data_hellaswag.py @@ -1,9 +1,7 @@ import os, json -def format_query_hellaswag(data, has_answer): - question_template = "Question: {question}\nOptions:\n(A) {A}\n(B) {B}\n(C) {C}\n(D) {D}\nAnswer: The most appropriate continuation is " - +def format_query_hellaswag(question_template, data, has_answer): prompt = question_template.format( question=data["Q"], A=data["A"], @@ -32,32 +30,30 @@ def load_file_hellaswag(fn, limit=None): return data -def get_fewshot_prompt_hellaswag(hellaswag_dir, num_fewshot): +def get_fewshot_prompt_hellaswag(hellaswag_path, question_template, num_fewshot): assert 0 <= num_fewshot <= 25 fewshot_prompt = "" - fewshot_fn = os.path.join(hellaswag_dir, "hellaswag_train_sampled25.jsonl") + fewshot_fn = os.path.join(hellaswag_path, "hellaswag_train_sampled25.jsonl") fewshot_data = load_file_hellaswag(fewshot_fn, num_fewshot) for item in fewshot_data: - fewshot_prompt += format_query_hellaswag(item, True) - + fewshot_prompt += format_query_hellaswag(question_template, item, True) return fewshot_prompt def load_data_hellaswag(args): - hellaswag_dir = os.path.join(args.data_path, "tasks", "hellaswag") - hellaswag_instruction = "Here are some multiple-choice questions about continuation writing. Each question contains a paragraph and four options for possible continuations. 
Choose the most appropriate continuation from options A, B, C, and D.\n\n\n" - + hellaswag_path = os.path.join(args.data_path, "tasks", "hellaswag") task_config = args.tasks_config["hellaswag"] + question_template = task_config["question_template"] task_data = {} - test_fn = os.path.join(hellaswag_dir, "hellaswag.jsonl") + test_fn = os.path.join(hellaswag_path, "hellaswag.jsonl") test_data = load_file_hellaswag(test_fn, task_config["limit"]) - fewshot_prompt = get_fewshot_prompt_hellaswag(hellaswag_dir, task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_prompt_hellaswag(hellaswag_path, question_template, task_config["num_fewshot"]) data = [] for item in test_data: - prompt = format_query_hellaswag(item, False) + prompt = format_query_hellaswag(question_template, item, False) data.append({ **item, - "instruction": hellaswag_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/humaneval/config_humaneval.json b/lm_cute_eval/tasks/humaneval/config_humaneval.json index afb04f8..5320457 100644 --- a/lm_cute_eval/tasks/humaneval/config_humaneval.json +++ b/lm_cute_eval/tasks/humaneval/config_humaneval.json @@ -1,3 +1,4 @@ { + "instruction": "Please complete the following python functions and output the entire function within a python code block, without any explainations.\n\n\n", "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/humaneval/load_data_humaneval.py b/lm_cute_eval/tasks/humaneval/load_data_humaneval.py index d874c50..3a6408c 100644 --- a/lm_cute_eval/tasks/humaneval/load_data_humaneval.py +++ b/lm_cute_eval/tasks/humaneval/load_data_humaneval.py @@ -2,8 +2,8 @@ from .human_eval.data import read_problems -def get_fewshot_prompt(humaneval_dir): - fewshot_fn = os.path.join(humaneval_dir, "fewshot_prompt.txt") +def get_fewshot_prompt(humaneval_path): + fewshot_fn = os.path.join(humaneval_path, "fewshot_prompt.txt") fewshot_prompt = "" with open(fewshot_fn, "r") as f: for line in f: @@ -16,16 +16,15 @@ def format_humaneval_prompt(question:str): def load_data_humaneval(args): - humaneval_instruction = "Please complete the following python functions and output the entire function within a python code block, without any explainations.\n\n\n" - humaneval_dir = os.path.join(args.data_path, "tasks", "humaneval") + humaneval_path = os.path.join(args.data_path, "tasks", "humaneval") task_config = args.tasks_config["humaneval"] - data = read_problems(os.path.join(humaneval_dir, "HumanEval.jsonl.gz")) + data = read_problems(os.path.join(humaneval_path, "HumanEval.jsonl.gz")) task_data = {"humaneval": []} - fewshot_prompt = get_fewshot_prompt(humaneval_dir) + fewshot_prompt = get_fewshot_prompt(humaneval_path) for humaneval_id, item in data.items(): task_data["humaneval"].append({ **item, - "instruction": humaneval_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": format_humaneval_prompt(item["prompt"]) }) diff --git a/lm_cute_eval/tasks/icleval/match_answer_icleval.py b/lm_cute_eval/tasks/icleval/match_answer_icleval.py index 4b075d8..bf9192f 100644 --- a/lm_cute_eval/tasks/icleval/match_answer_icleval.py +++ b/lm_cute_eval/tasks/icleval/match_answer_icleval.py @@ -5,6 +5,7 @@ def match_answer_icleval(infer_result:dict, round_idx:int, args): subject_correct_cnt = 0 total_cnt += len(subject_result) for item in subject_result: + item[f"judge_round{round_idx}"] = False ans = str(item["label"]).strip() if subject 
== "generate_output_format": ans.replace("value", "key") @@ -13,6 +14,7 @@ def match_answer_icleval(infer_result:dict, round_idx:int, args): else: ans.replace("key", item["ans_content"]) if ans in item[f"infer_round{round_idx}"]: + item[f"judge_round{round_idx}"] = True subject_correct_cnt += 1 result[subject] = { "acc": subject_correct_cnt / len(subject_result) diff --git a/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py b/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py index 4ad2441..9b31804 100644 --- a/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py +++ b/lm_cute_eval/tasks/mmlu/match_answer_mmlu.py @@ -1,33 +1,31 @@ import re from ..match_answer import find_first_selection -mmlu_pattern = r"The\s+answer\s+is\s+[\(\[\{]*([ABCD])[\)\]\}]*\.?" + def match_answer_mmlu(infer_result:dict, round_idx, args): + # mmlu_pattern = r"The\s+answer\s+is\s+[\(\[\{]*([ABCD])[\)\]\}]*\.?" task_config = args.tasks_config["mmlu"] result = {} for subject in task_config["subjects"]: correct_cnt = 0 - total_cnt = 0 for item in infer_result[subject]: - l = re.findall(mmlu_pattern, item[f"infer_round{round_idx}"]) - if len(l) > 0: - model_answer = l[0][0] - else: - model_answer = find_first_selection(item[f"infer_round{round_idx}"]) + # l = re.search(mmlu_pattern, item[f"infer_round{round_idx}"]) + # if len(l) > 0: + # model_answer = l[0] + # else: + # model_answer = find_first_selection(item[f"infer_round{round_idx}"]) + model_answer = find_first_selection(item[f"infer_round{round_idx}"]) item[f"extract_answer_round{round_idx}"] = model_answer - item[f"judge{round_idx}"] = False - if not item[f"extract_answer_round{round_idx}"]: - continue - total_cnt+=1 + item[f"judge_round{round_idx}"] = False if model_answer == item["ans"]: correct_cnt += 1 - item[f"judge{round_idx}"] = True + item[f"judge_round{round_idx}"] = True - subject_result = correct_cnt / total_cnt + subject_result = correct_cnt / len(infer_result[subject]) result[subject] = { "acc": subject_result, "correct_cnt": correct_cnt, - "tot_cnt": total_cnt + "tot_cnt": len(infer_result[subject]) } result["mmlu"] = { diff --git a/lm_cute_eval/tasks/xsum/config_xsum.json b/lm_cute_eval/tasks/xsum/config_xsum.json index cfa6ec4..354994a 100644 --- a/lm_cute_eval/tasks/xsum/config_xsum.json +++ b/lm_cute_eval/tasks/xsum/config_xsum.json @@ -1,4 +1,5 @@ { + "instruction": "You will be asked to read a dialog and summary the dialog. Some examples of dialogs and summaries are provided below.Think step by step, then write a summary of the form 'Answer: $ANSWER' at the end of your response.", "num_fewshot": 1, "limit": null } \ No newline at end of file diff --git a/lm_cute_eval/tasks/xsum/load_data_xsum.py b/lm_cute_eval/tasks/xsum/load_data_xsum.py index 796301a..7d5319f 100644 --- a/lm_cute_eval/tasks/xsum/load_data_xsum.py +++ b/lm_cute_eval/tasks/xsum/load_data_xsum.py @@ -1,28 +1,19 @@ import os, json -xsum_dir = os.path.join("data", "tasks", "xsum") -xsum_instruction = "You will be asked to read a dialog and summary the dialog. Some examples of dialogs and summaries are provided below.Think step by step, then write a summary of the form 'Answer: $ANSWER' at the end of your response." 
- - def load_file_xsum(fn, limit=0): data = [] with open(fn, "r", encoding='utf-8') as f: - for line in f: - try: - data.append(json.loads(line)) - except json.JSONDecodeError as e: - print(f"Skipping line with JSONDecodeError: {e}") - continue + data.append(json.loads(line)) if limit and len(data) >= limit: break return data -def get_fewshot_cot_prompt_xsum(num_fewshot): +def get_fewshot_cot_prompt_xsum(xsum_path, num_fewshot): assert 0 <= num_fewshot <= 8 - fewshot_cot_fn = os.path.join(xsum_dir, "fewshot.txt") + fewshot_cot_fn = os.path.join(xsum_path, "fewshot.txt") file_str = "" with open(fewshot_cot_fn, "r") as f: for line in f: @@ -35,15 +26,16 @@ def get_fewshot_cot_prompt_xsum(num_fewshot): def load_data_xsum(args): + xsum_path = os.path.join(args.data_path, "tasks", "xsum") task_config = args.tasks_config["xsum"] - test_data = load_file_xsum(os.path.join(xsum_dir, "test.jsonl"), task_config["limit"]) + test_data = load_file_xsum(os.path.join(xsum_path, "test.jsonl"), task_config["limit"]) task_data = {"xsum": []} - fewshot_prompt = get_fewshot_cot_prompt_xsum(task_config["num_fewshot"]) + fewshot_prompt = get_fewshot_cot_prompt_xsum(xsum_path, task_config["num_fewshot"]) for item in test_data: prompt = "Question: " + item["question"] + "\nAnswer:" task_data["xsum"].append({ **item, - "instruction": xsum_instruction, + "instruction": task_config["instruction"], "fewshot_prompt": fewshot_prompt, "prompt_round1": prompt, }) diff --git a/lm_cute_eval/tasks/xsum/match_answer_xsum.py b/lm_cute_eval/tasks/xsum/match_answer_xsum.py index 1382860..b67195c 100644 --- a/lm_cute_eval/tasks/xsum/match_answer_xsum.py +++ b/lm_cute_eval/tasks/xsum/match_answer_xsum.py @@ -1,78 +1,21 @@ - -# ''' -# # { -# # 'colbert': [0.7796499729156494, 0.4621465802192688, 0.4523794651031494, 0.7898575067520142], -# # 'sparse': [0.195556640625, 0.00879669189453125, 0.0, 0.1802978515625], -# # 'dense': [0.6259765625, 0.347412109375, 0.349853515625, 0.67822265625], -# # 'sparse+dense': [0.482503205537796, 0.23454029858112335, 0.2332356721162796, 0.5122477412223816], -# # 'colbert+sparse+dense': [0.6013619303703308, 0.3255828022956848, 0.32089319825172424, 0.6232916116714478] -# # } -# ''' # from FlagEmbedding import BGEM3FlagModel -def calc_similarity(text1:str, text2:str): - - text1 = [text1] - text2 = [text2] - text1_ids = model.encode(text1, batch_size=256, max_length=1024, )['dense_vecs'] - text2_ids = model.encode(text2, batch_size=256, max_length=1024, )['dense_vecs'] - similarity = text1_ids @ text2_ids.T - if similarity > 0.8: - return True - else: - return False - -def match_answer_xsum(infer_result, round_idx, args): - exact_match_cnt = 0 - result = {} - answer = [] - exact_answer = [] - # model = BGEM3FlagModel('/home/admin/workspace/aop_lab/app_source/dcy/download/model/BAAI/bge-m3', use_fp16=True) - # for item in infer_result["xsum"]: - # answer.append(item["answer"]) - # exact_answer.append(item[f"infer_round{round_idx}"]) - # text1_ids = model.encode(exact_answer, batch_size=256, max_length=1024, )['dense_vecs'] - # text2_ids = model.encode(answer, batch_size=256, max_length=1024, )['dense_vecs'] - # similarities = text1_ids @ text2_ids.T - # for similarity in similarities: - # exact_match_cnt+= similarity - # item[f"exact_match{round_idx}"] = similarity - - result["xsum"] = { - "similarity": (exact_match_cnt / len(infer_result["xsum"])), - } - return result -# import json -# import os -# os.environ["CUDA_VISIBLE_DEVICES"] = "6" -# if __name__ == "__main__": -# path = 
"/data1/dcy/projects/evaluate/lm-cute-eval/output/6-17_17:46_Llama-2-13b-chat-hf/infer_results/xsum/xsum.json" - -# exact_match_cnt = 0 +# def match_answer_xsum(infer_result, round_idx, args): +# similarity_sum = 0 # result = {} -# answer = [] -# exact_answer1 = [] -# exact_answer2 = [] -# with open(path, "r") as f: -# xsum_result = json.load(f) -# eval_result = xsum_result -# model = BGEM3FlagModel('/data1/dcy/downloads/model/BAAI/bge-m3', use_fp16=True) -# for item in xsum_result: -# answer.append(item["answer"]) -# exact_answer1.append(item[f"infer_round1"]) -# exact_answer2.append(item[f"infer_round2"]) -# exact_answer1_ids = model.encode(exact_answer1, batch_size=256, max_length=1024, )['dense_vecs'] -# exact_answer2_ids = model.encode(exact_answer2, batch_size=256, max_length=1024, )['dense_vecs'] -# text2_ids = model.encode(answer, batch_size=256, max_length=1024, )['dense_vecs'] -# similarities1 = exact_answer1_ids @ text2_ids.T -# similarities2 = exact_answer2_ids @ text2_ids.T -# for similarity in similarities1: -# exact_match_cnt+= similarity -# eval_result[f"exact_match1"] = similarity -# for similarity in similarities2: -# exact_match_cnt+= similarity -# eval_result[f"exact_match2"] = similarity +# similarity_model = BGEM3FlagModel("/data2/dcy/downloads/model/BAAI/bge-m3", use_fp16=True) +# for item in infer_result["xsum"]: +# label_ids = similarity_model.encode(item["answer"], batch_size=256, max_length=1024, )['dense_vecs'] +# model_answer_ids = similarity_model.encode(item[f"infer_round{round_idx}"], batch_size=256, max_length=1024, )['dense_vecs'] +# similarity = float(label_ids @ model_answer_ids.T) +# similarity_sum += similarity +# item[f"similarity_round{round_idx}"] = similarity - - \ No newline at end of file +# result["xsum"] = { +# "similarity": similarity_sum / len(infer_result["xsum"]), +# } +# return result + +def match_answer_xsum(infer_result, round_idx, args): + return {"xsum": "skipped"} \ No newline at end of file diff --git a/run.sh b/run.sh index 14d7794..1a09019 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -export CUDA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=3 export TOKENIZERS_PARALLELISM=false declare -A models=( @@ -15,12 +15,17 @@ for model_name in "${!models[@]}"; do model_path=${models[$model_name]} python main.py \ --model_path "$model_path" \ - --model_type hf \ + --model_type vllm \ --format_type default \ - --tasks drop \ + --tasks all \ --save_name "$model_name" \ --save_infer_texts \ - --config_path config_debug.json \ + --save_infer_results \ + --config_path "config_debug.json" \ --output_path output/debug \ - --use_cpu + --max_new_tokens 180 \ + --temperature 0.1 \ + --top_p 0.2 \ + --top_k 20 \ + done