-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.py
executable file
·75 lines (61 loc) · 2.59 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
"""A script that cleans and normalizes the data obtained from the CHILDES corpus."""
import argparse
import re
def get_data(path: str) -> list:
    """Read the file at ``path`` into a list of stripped, non-empty lines.

    Args:
        path: location of the text file to read.

    Returns:
        One string per non-blank line, in file order, with leading and
        trailing whitespace (including the newline) removed.
    """
    list_of_lines = []
    with open(path, "r") as source:
        for line in source:
            stripped = line.strip()
            # A blank line still holds its newline when read, so emptiness
            # has to be tested after stripping for such lines to be skipped.
            if stripped:
                list_of_lines.append(stripped)
    return list_of_lines
def part1(data: list) -> list:
    """Drop the strings starting with '&', '+', '-', or a digit.

    Args:
        data: list of string tokens; it is filtered in place.

    Returns:
        The same list object, holding only the tokens that were kept, in
        their original order.
    """
    # Rebuild the contents in a single pass: calling remove() while looping
    # over the same list makes the loop skip the token after each removal.
    # The slice token[:1] keeps an empty token from raising IndexError.
    data[:] = [
        token
        for token in data
        if not (token.startswith(("&", "+", "-")) or token[:1].isdigit())
    ]
    return data
def part4(data: list) -> list:
    """Drop the 'special code' tokens.

    A token counts as special code when its first or its last character has
    code point 21.

    Args:
        data: list of string tokens; it is filtered in place.

    Returns:
        The same list object, holding only the tokens that were kept, in
        their original order.
    """
    marker = chr(21)
    # Rebuild the contents in a single pass: calling remove() while looping
    # over the same list makes the loop skip the token after each removal.
    # startswith/endswith also cope with an empty token, unlike indexing.
    data[:] = [
        token
        for token in data
        if not (token.startswith(marker) or token.endswith(marker))
    ]
    return data
def writelines(data: list, path: str) -> None:
    """Write each element of ``data`` to the file at ``path``, one per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, "w") as sink:
        # every element is converted with str() and followed by a newline
        sink.writelines(str(entry) + "\n" for entry in data)
def main(args: argparse.Namespace) -> None:
    """Clean every line read from ``args.input`` and write it to ``args.output``.

    Each line goes through the same fixed sequence of substitutions and
    token filters; one cleaned, case-folded line is written per input line.
    """
    # patterns are listed in the order they are applied to a line
    parentheses = re.compile(r"[()]")
    chevrons = re.compile(r"[<>]")
    bracketed = re.compile(r"\[.*?\]")
    leading_six = re.compile(r"^.{6}")
    punctuation = re.compile(r"[^\w\d'\s]+")
    whitespace_run = re.compile(r"\s+")

    cleaned_data = []
    for raw_line in get_data(args.input):
        # parentheses and chevrons go, their content stays
        text = parentheses.sub("", raw_line)
        text = chevrons.sub("", text)
        # brackets go together with their content
        text = bracketed.sub("", text)
        # drops the first six characters (the '*MOT: ' marker of an utterance line)
        text = leading_six.sub("", text)
        # token-level filters work on the whitespace-separated pieces
        tokens = part4(part1(text.split()))
        utterance = " ".join(tokens)
        # all punctuation except apostrophes is removed
        utterance = punctuation.sub('', utterance)
        # runs of whitespace collapse to a single space
        utterance = whitespace_run.sub(" ", utterance)
        cleaned_data.append(utterance.strip().casefold())
    writelines(cleaned_data, args.output)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    # Both paths are mandatory: if one is omitted main() receives None and
    # fails when opening the file, so let argparse report the omission.
    parser.add_argument("--input", required=True, help="path to the input file")
    parser.add_argument("--output", required=True, help="path to the output file")
    main(parser.parse_args())