-
Notifications
You must be signed in to change notification settings - Fork 3
/
encode-corpus.py
72 lines (66 loc) · 1.87 KB
/
encode-corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
import time
import io
import sys
import os
from collections import defaultdict
# Encodes a whitespace-tokenized text corpus as a corpus of integer ids.
#
# usage: encode-corpus.py TEXT_CORPUS VOCAB_FILE INT_CORPUS [ready]
#
#   TEXT_CORPUS  utf8 input, one sentence per line, tokens split on whitespace
#   VOCAB_FILE   utf8 vocabulary, one "<id> <token>" pair per line
#   INT_CORPUS   utf8 output, the same lines with every token replaced by its id
#   ready        pass the literal string 'ready' to reuse an existing VOCAB_FILE;
#                anything else (or nothing) creates VOCAB_FILE from scratch
#
# When an existing vocabulary is reused it is only read, never written: tokens
# that are missing from it still receive fresh ids in INT_CORPUS, but those new
# ids are not saved back to VOCAB_FILE.

# id 0 is reserved for openfst epsilon
# id 1 is reserved for null alignments
FIRST_FREE_ID = 2


def _readVocab(vocabFile):
    """Parse an open vocab file into a token -> id dict.

    Blank lines are skipped. Any other line must consist of exactly two
    whitespace-separated fields, an integer id followed by the token;
    otherwise the offending (1-based) line number is printed and the script
    exits with status 1.

    Returns a (vocab, nextId) pair, where nextId is one past the largest id
    seen and never lower than FIRST_FREE_ID, so a fresh id cannot collide
    with a loaded or reserved one even if the file is empty or unsorted.
    """
    vocab = {}
    maxId = FIRST_FREE_ID - 1
    for lineNumber, line in enumerate(vocabFile, 1):
        splits = line.strip().split()
        if not splits:
            continue
        tokenId = None
        if len(splits) == 2:
            try:
                tokenId = int(splits[0])
            except ValueError:
                # a non-integer id is reported like any other malformed line
                tokenId = None
        if tokenId is None:
            print('vocab file is malformatted at line #{0}'.format(lineNumber))
            sys.exit(1)
        vocab[splits[1]] = tokenId
        if tokenId > maxId:
            maxId = tokenId
    return vocab, maxId + 1


def _encodeCorpus(textCorpus, intCorpus, vocab, nextId, vocabFile=None):
    """Write the integer encoding of textCorpus to intCorpus.

    Tokens not yet in vocab are assigned consecutive ids starting at nextId
    and added to vocab. If vocabFile is given, each newly assigned
    "<id> <token>" pair is appended to it as well. Progress is printed every
    1000 lines.

    Returns the next unused id.
    """
    linesCounter = 0
    for line in textCorpus:
        encoded = []
        for token in line.strip().split():
            # membership test on the dict itself: O(1) instead of scanning keys()
            if token not in vocab:
                vocab[token] = nextId
                if vocabFile is not None:
                    # record the id that was actually assigned to the token,
                    # i.e. before nextId moves on to the following one
                    vocabFile.write(u'{0} {1}\n'.format(nextId, token))
                nextId += 1
            encoded.append(u'{0}'.format(vocab[token]))
        intCorpus.write(u' '.join(encoded) + u'\n')
        # for logging only
        linesCounter += 1
        if linesCounter % 1000 == 0:
            print('nextId={0}'.format(nextId))
            print('linesCounter={0}'.format(linesCounter))
    return nextId


def main(argv=None):
    """Run the encoder; argv follows the sys.argv layout described above."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit('usage: {0} TEXT_CORPUS VOCAB_FILE INT_CORPUS [ready]'.format(
            argv[0] if argv else 'encode-corpus.py'))
    textCorpusFilename, vocabFilename, intCorpusFilename = argv[1:4]
    # to indicate the vocab file is ready, pass 'ready' as the 4th argument
    vocabStatus = argv[4] if len(argv) > 4 else 'not ready'
    vocabReady = vocabStatus == 'ready'

    if vocabReady:
        print('reusing the vocab file')
        with io.open(vocabFilename, encoding='utf8', mode='r') as vocabFile:
            vocab, nextId = _readVocab(vocabFile)
    else:
        print('creating the vocab file')
        vocab, nextId = {}, FIRST_FREE_ID

    with io.open(textCorpusFilename, encoding='utf8', mode='r') as textCorpus:
        with io.open(intCorpusFilename, encoding='utf8', mode='w') as intCorpus:
            if vocabReady:
                _encodeCorpus(textCorpus, intCorpus, vocab, nextId)
            else:
                with io.open(vocabFilename, encoding='utf8', mode='w') as vocabFile:
                    _encodeCorpus(textCorpus, intCorpus, vocab, nextId, vocabFile)


if __name__ == '__main__':
    main()