-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_reader.py
39 lines (31 loc) · 1.56 KB
/
file_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import docx # must be python-docx for use with python 3
# Paragraph texts that process_data() discards. Matching is by exact string
# equality, so every entry must reproduce the document's text character for
# character (including the number of dots and the spelling "Ridding").
# NOTE(review): the dotted entries look like fill-in-the-blank leader lines and
# 'Red Ridding Hood' like the document's title line -- confirm against the .docx.
GARBAGE = ['', ',', '………………………………….', '………………………………………………………..', '……………………………………………………………………………………..',
'………………………………………………………………………………………………………………………………………', 'Red Ridding Hood']
def load_data(filename):
    """Open the .docx file *filename* and return the text of each paragraph.

    The result is a list of strings, one per entry of the document's
    ``paragraphs`` sequence, in document order.
    """
    document = docx.Document(filename)
    return [paragraph.text for paragraph in document.paragraphs]
def process_data(data, garbage=None):
    """Return the entries of *data* that are not filler text.

    Args:
        data: Iterable of paragraph strings.
        garbage: Optional collection of strings to drop. An entry is removed
            only when it is exactly equal to a member of this collection.
            When omitted (``None``), the module-level ``GARBAGE`` list is used,
            which preserves the original behaviour.

    Returns:
        A new list with the surviving entries in their original order.
    """
    # Compare against None rather than truthiness so that an explicitly empty
    # collection means "filter nothing" instead of falling back to GARBAGE.
    if garbage is None:
        garbage = GARBAGE
    return [datum for datum in data if datum not in garbage]
def get_text(file_name):
    """Read the .docx at *file_name* and return its paragraph texts after filtering.

    This chains load_data() and process_data(): the paragraphs are loaded
    first and the result is then passed through the garbage filter.
    """
    raw_paragraphs = load_data(file_name)
    return process_data(raw_paragraphs)
if __name__ == "__main__":
    # Script entry point: split the transcript document into one text file
    # per speaker under romanized_transcripts/.
    import os

    results = process_data(load_data("Red Ridding Hood Vandana.docx"))

    # The first story is in two paragraphs: remove the first of them and
    # prepend it to the second so that every speaker has exactly one
    # transcript entry.
    first = results.pop(1)
    results[1] = first + " " + results[1]

    # Populate the dictionary with speaker names as keys and transcripts as
    # values. Entries alternate: even positions are speaker names, odd
    # positions are the transcript belonging to the preceding name.
    results_dict = {}
    for i, entry in enumerate(results):
        if i % 2 == 0:
            print(entry)
            results_dict[entry] = ""
        else:
            results_dict[results[i - 1]] = entry

    # Create the output directory if it is missing instead of failing on the
    # first open().
    os.makedirs("romanized_transcripts", exist_ok=True)

    for i, (key, transcript) in enumerate(results_dict.items(), start=1):
        # File name: two-digit running number plus the lower-cased speaker
        # name with whitespace runs replaced by underscores.
        name = "_".join(key.lower().split())
        out_path = f"romanized_transcripts/{i:02d}_{name}.txt"
        # The context manager closes the file even if write() raises; the
        # explicit encoding keeps output independent of the platform default.
        with open(out_path, "w", encoding="utf-8") as output:
            output.write(transcript)

    print(len(results))