"""preprocess.py: convert ichikara-instruction and AnswerCarefully JSON files
into shuffled, chat-format JSONL files.
"""
import random
import json
from pathlib import Path
from argparse import ArgumentParser
# Japanese system prompt placed as the first ("system") message of every
# converted sample. English gloss: "Below is an instruction that describes a
# task. Write a response that appropriately satisfies the request."
NO_INPUT_PROMPT: str = "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"
def main():
    """Convert the ichikara and AnswerCarefully JSON files to chat-format JSONL.

    Command-line arguments (all required):
        --ichikara-dir: directory containing the ichikara-instruction JSON files.
        --answer-carefully-dir: directory containing
            ``AnswerCarefullyVersion002_Dev.json``.
        --output-dir: directory that receives ``ichikara.jsonl`` and
            ``answer_carefully.jsonl``; it is created if it does not exist.

    Each input file must hold an iterable of records with the keys ``ID``,
    ``text`` and ``output``. Every record becomes a three-message conversation
    (system / user / assistant). Each dataset is shuffled with a fixed seed and
    written as one JSON object per line.

    Raises:
        FileNotFoundError: if an expected input file is missing.
        KeyError: if a record lacks ``ID``, ``text`` or ``output``.
    """
    parser = ArgumentParser()
    parser.add_argument("--ichikara-dir", type=str, required=True)
    parser.add_argument("--answer-carefully-dir", type=str, required=True)
    parser.add_argument("--output-dir", type=str, required=True)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)

    # The two ichikara files are merged into a single output file.
    ichikara_dir = Path(args.ichikara_dir)
    ichikara_filenames: list[str] = [
        "ichikara-instruction-003-001-1",
        "ichikara-instruction-003-003-1",
    ]
    saved_ichikara_samples: list[dict] = []
    for ichikara_filename in ichikara_filenames:
        ichikara_filepath: Path = ichikara_dir / f"{ichikara_filename}.json"
        print(ichikara_filepath)
        saved_ichikara_samples.extend(_load_chat_samples(ichikara_filepath))
    _shuffle_and_write_jsonl(saved_ichikara_samples, output_dir / "ichikara.jsonl")

    answer_carefully_filepath: Path = (
        Path(args.answer_carefully_dir) / "AnswerCarefullyVersion002_Dev.json"
    )
    saved_answer_carefully_samples = _load_chat_samples(answer_carefully_filepath)
    _shuffle_and_write_jsonl(
        saved_answer_carefully_samples, output_dir / "answer_carefully.jsonl"
    )


def _to_chat_sample(raw_sample: dict) -> dict:
    """Turn one raw record into an ``ID`` plus system/user/assistant messages."""
    return {
        "ID": raw_sample["ID"],
        "messages": [
            {"role": "system", "content": NO_INPUT_PROMPT},
            {"role": "user", "content": raw_sample["text"]},
            {"role": "assistant", "content": raw_sample["output"]},
        ],
    }


def _load_chat_samples(json_path: Path) -> list[dict]:
    """Read a UTF-8 JSON file of raw records and convert each to a chat sample."""
    with json_path.open(mode="r", encoding="utf-8") as f:
        return [_to_chat_sample(raw_sample) for raw_sample in json.load(f)]


def _shuffle_and_write_jsonl(samples: list[dict], output_path: Path) -> None:
    """Shuffle ``samples`` in place with a fixed seed and write them as JSONL."""
    # Re-seed before every shuffle so each output file's order depends only on
    # its own sample count, not on what was shuffled earlier.
    random.seed(42)
    random.shuffle(samples)
    # Create the destination directory on demand so a fresh --output-dir works.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        for sample in samples:
            # ensure_ascii=False keeps Japanese text readable instead of \u escapes.
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()