forked from bojone/kg-2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_trans.py
76 lines (50 loc) · 1.83 KB
/
data_trans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#! -*- coding:utf-8 -*-
import codecs
import io
import json

from tqdm import tqdm
# Collect the distinct predicate names from the schema file, which is parsed
# as one JSON object per line.
all_50_schemas = set()
# Decode explicitly as UTF-8: the built-in open() falls back to the locale's
# preferred encoding, which is not UTF-8 on every platform.
with io.open('all_50_schemas', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l)
        all_50_schemas.add(a['predicate'])

# Sort before numbering: the iteration order of a set of strings is not
# stable across interpreter runs, so enumerating the set directly could give
# every predicate a different id each time this script is re-run.
id2predicate = {i: j for i, j in enumerate(sorted(all_50_schemas))}
predicate2id = {j: i for i, j in id2predicate.items()}

# Note: json.dump serializes the integer keys of id2predicate as strings.
with codecs.open('all_50_schemas_me.json', 'w', encoding='utf-8') as f:
    json.dump([id2predicate, predicate2id], f, indent=4, ensure_ascii=False)
# Character -> occurrence count, filled from the training text below and
# reused further down the script.
chars = {}
# Minimum occurrence count a character needs to stay in the vocabulary.
min_count = 2

# Convert the training set (one JSON object per line) into a list of
# {'text', 'spo_list'} records with each triple flattened to
# (subject, predicate, object).
train_data = []
# Decode explicitly as UTF-8: the built-in open() falls back to the locale's
# preferred encoding, which is not UTF-8 on every platform.
with io.open('train_data.json', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l)
        # Samples without any triple are left out of the training set and
        # do not contribute to the character counts.
        if not a['spo_list']:
            continue
        train_data.append(
            {
                'text': a['text'],
                'spo_list': [(i['subject'], i['predicate'], i['object']) for i in a['spo_list']]
            }
        )
        for c in a['text']:
            chars[c] = chars.get(c, 0) + 1

with codecs.open('train_data_me.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, indent=4, ensure_ascii=False)
# Convert the dev set (one JSON object per line) into a list of
# {'text', 'spo_list'} records with each triple flattened to
# (subject, predicate, object). Unlike the training set, samples with an
# empty spo_list are kept.
dev_data = []
# Decode explicitly as UTF-8: the built-in open() falls back to the locale's
# preferred encoding, which is not UTF-8 on every platform.
with io.open('dev_data.json', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l)
        dev_data.append(
            {
                'text': a['text'],
                'spo_list': [(i['subject'], i['predicate'], i['object']) for i in a['spo_list']]
            }
        )
        # Dev text also feeds the shared character counts.
        for c in a['text']:
            chars[c] = chars.get(c, 0) + 1

with codecs.open('dev_data_me.json', 'w', encoding='utf-8') as f:
    json.dump(dev_data, f, indent=4, ensure_ascii=False)
# Build the character vocabulary from the accumulated counts and write both
# lookup directions to disk.
with codecs.open('all_chars_me.json', 'w', encoding='utf-8') as f:
    # Drop characters that occur fewer than min_count times.
    chars = {
        char: count
        for char, count in chars.items()
        if count >= min_count
    }
    # Ids start at 2 because 0 is reserved for padding and 1 for unknown
    # characters.
    id2char = {idx: char for idx, char in enumerate(chars, 2)}
    # Inverse mapping, built in the same order as id2char.
    char2id = dict(zip(id2char.values(), id2char.keys()))
    json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)