-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_preprocess.py
53 lines (41 loc) · 1.37 KB
/
2_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import argparse
import re
from pathlib import Path
from tqdm import tqdm
def pre_process(text: str):
text = text.replace("-\n", "").replace("’", "'")
text = re.sub("\\d+\\s?\n", "\n", text)
text = re.sub(r"HAS [•\-/].*\n", "\n", text)
text = re.sub(r"\d+\.\d+\.\n", "", text)
return text
parser = argparse.ArgumentParser(description="Pre-processing step")
parser.add_argument(
"input_path",
nargs=1,
help="Specify input extraction or input folder containing extractions",
)
parser.add_argument(
"--corpus_file",
nargs=1,
dest="corpus_file",
default=["full_text.txt"],
help="if the input is a directory name, this parameter gives the name "
"of the text file containing the extracted corpus. Default: full_text.txt",
)
args = parser.parse_args()
if __name__ == "__main__":
input_path = Path(args.input_path[0])
input_files = []
if input_path.is_file():
input_files.append(input_path)
elif files := list(input_path.glob("*.txt")):
input_files.extend(files)
else:
for file in input_path.glob("*"):
input_files.extend(list(file.glob(f"{args.corpus_file[0]}")))
for file in tqdm(input_files):
with open(file, "r") as f:
input_text = f.read()
output_text = pre_process(input_text)
with open(file, "w") as f:
f.write(output_text)