-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstep2_topkl.py
59 lines (46 loc) · 1.88 KB
/
step2_topkl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import joblib
from pathlib import Path
import pandas as pd
import neuralcoref
import spacy
# Load the spaCy model named 'en_core_web_lg' and register the neuralcoref
# component on that pipeline, passing greedyness=0.45.
# NOTE(review): no active statement visible in this file reads `nlp`; it
# appears to exist only for a coreference-resolution step that is currently
# disabled. Confirm whether this (potentially slow) load is still needed.
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp, greedyness=0.45)
def clean_text(text):
    """Return *text* with every newline and carriage return replaced by a space.

    Each '\\n' and each '\\r' becomes one space, so a '\\r\\n' pair yields two
    spaces. *text* must be a ``str``; anything else (for example a NaN read
    from an empty spreadsheet cell) raises ``AttributeError``.
    """
    return text.replace('\n', ' ').replace('\r', ' ')


def group_questions_by_section(mapping):
    """Group the usable question rows of one split by their section text.

    Args:
        mapping: DataFrame with a default 0..n-1 index and the columns
            'local-or-sum', 'inference', 'cor_section', 'question',
            'answer1', 'attribute1', 'book_id', 'book_name', 'section_id'.

    Returns:
        A dict mapping each cleaned section text to the list of question
        records that belong to it. Sections and records keep the order in
        which they first appear in *mapping*.

    Rows are skipped when 'local-or-sum' equals 'summary' or when the
    'inference' cell is not a string.
    """
    sections = {}
    for i in range(len(mapping)):
        inference = mapping.at[i, 'inference']
        if mapping.at[i, 'local-or-sum'] == 'summary' or not isinstance(inference, str):
            continue

        section = clean_text(mapping.at[i, 'cor_section'])
        # NOTE: a coreference-resolution pass over the section text is
        # currently disabled. It used the module-level `nlp` pipeline:
        #   doc = nlp(section)._.coref_resolved
        #   doc = nlp(doc)
        #   sents = [c.string.strip() for c in doc.sents]
        #   section = ' '.join(sents)

        # Build the record before touching `sections` so that a malformed
        # row fails without leaving an empty entry behind.
        record = {
            'question': clean_text(mapping.at[i, 'question']),
            'type': mapping.at[i, 'attribute1'],
            'inference': clean_text(inference),
            'answer': clean_text(mapping.at[i, 'answer1']),
            'book_id': mapping.at[i, 'book_id'],
            'book_name': mapping.at[i, 'book_name'],
            'section_id': mapping.at[i, 'section_id'],
        }
        # Dict membership replaces a linear scan of a list of section
        # strings that was performed for every row.
        sections.setdefault(section, []).append(record)
    return sections


def main():
    """Build the per-split section/question index and dump it to disk.

    Reads 'data/train.xlsx', 'data/val.xlsx' and 'data/test.xlsx', groups
    each split with ``group_questions_by_section`` and writes the resulting
    ``{split: {section: [records]}}`` dict to 'data/data.pkl' via joblib.

    For each split it prints the number of unique sections in that split
    followed by the running total of kept questions; after the last split
    it prints the total number of sections and the total number of
    questions.

    Returns:
        The dict that was written to 'data/data.pkl'.
    """
    counter = 0    # unique sections, summed over splits
    counter_q = 0  # kept questions, summed over splits
    data = {}
    for split in ['train', 'val', 'test']:
        mapping = pd.read_excel('data/{}.xlsx'.format(split))
        sections = group_questions_by_section(mapping)
        data[split] = sections

        counter_q += sum(len(records) for records in sections.values())
        print(len(sections))
        print(counter_q)
        counter += len(sections)

    print(counter)
    print(counter_q)
    joblib.dump(data, 'data/data.pkl')
    return data


# Run the pipeline only when executed as a script, so that importing this
# module does not read or write any data files.
if __name__ == '__main__':
    data = main()