forked from theeluwin/pytorch-sgns
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare.py
executable file
·51 lines (39 loc) · 1.53 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import requests
import os
import pickle
# Change here. todo: build CLI
UD_LANGS = ['de', 'en', 'it']
UD_MODE = 'train'  # 'test' # 'dev' #

# Comment marker handed to np.loadtxt for each language (None disables
# comment stripping, so a literal '#' in the data is kept as a field).
COMMENT_MARKERS = {'de': None, 'en': None, 'it': '#'}

UD_BASE_URL = 'https://raw.githubusercontent.com/ufal/rh_nntagging/master/data/ud-1.2'


def keep_token(form):
    """Return True when the word form *form* belongs in the corpus.

    A form is kept when it does not contain '|' and is either a single
    character or contains at least one alphanumeric character.
    """
    # the unknown character (I'm using |) may not be in corpus.
    return '|' not in form and (len(form) == 1 or any(c.isalnum() for c in form))


def collect_sentences(rows):
    """Group token rows into sentences and map each kept word to its class.

    Args:
        rows: iterable of indexable rows where ``row[0]`` is the token id,
            ``row[1]`` the word form and ``row[3]`` the class label
            (presumably the UPOS column of CoNLL-U -- confirm against the data).

    Returns:
        ``(sents, word2class)``: ``sents`` is a list of sentences, each a list
        of kept word forms; ``word2class`` maps every kept word form to the
        class of its last occurrence.
    """
    sents = []
    word2class = {}
    curr = []
    for w in rows:
        # A token id of exactly '1' opens a new sentence.
        # NOTE(review): range ids such as '1-2' do not match here, so such a
        # row would be attached to the preceding sentence -- confirm whether
        # the input files contain them.
        if w[0] == '1':
            if curr:
                sents.append(curr)
            curr = []
        if keep_token(w[1]):
            curr.append(w[1])
            word2class[w[1]] = w[3]
    # Flush the sentence still being built when the input ends; without this
    # the final sentence of every file is silently lost.
    if curr:
        sents.append(curr)
    return sents, word2class


def load_conllu(ud_lang, ud_mode):
    """Load ``data/<ud_lang>/<ud_lang>-ud-<ud_mode>.conllu``, downloading it if absent.

    Returns:
        The string array produced by ``np.loadtxt`` for the tab-separated file.

    Raises:
        requests.HTTPError: if the file has to be downloaded and the server
            answers with an error status.
    """
    folder = f'data/{ud_lang}'
    filename = f'{ud_lang}-ud-{ud_mode}.conllu'
    path = f'{folder}/' + filename
    os.makedirs(folder, exist_ok=True)
    if not os.path.exists(path):
        r = requests.get(f'{UD_BASE_URL}/{ud_lang}/' + filename, timeout=60)
        # Fail loudly instead of caching an error page as if it were data.
        r.raise_for_status()
        # Store the payload byte-for-byte so no encoding guess is involved.
        with open(path, 'wb') as f:
            f.write(r.content)
    return np.loadtxt(path, dtype=str, delimiter='\t',
                      comments=COMMENT_MARKERS[ud_lang], encoding='utf-8')


def main():
    """Write ``corpus_<mode>.txt`` and ``word2class.dat`` for every language."""
    for ud_lang in UD_LANGS:
        data = load_conllu(ud_lang, UD_MODE)
        sents, word2class = collect_sentences(data)
        print('got corpus with sentences: ', len(sents))
        with open(f'data/{ud_lang}/corpus_{UD_MODE}.txt', 'w', encoding='utf-8') as f:
            f.writelines(' '.join(s) + '\n' for s in sents)
        # Context manager so the pickle file is flushed and closed.
        with open(f'data/{ud_lang}/word2class.dat', 'wb') as f:
            pickle.dump(word2class, f)


if __name__ == '__main__':
    main()