load_data_util.py

import collections
import os
import pickle
import sys

import nltk
import numpy as np


def Idx2Word(doc_idx, vocabulary):
    """Map a sequence of word indices back to words, stopping at the padding index 0."""
    words = []
    for idx in doc_idx:
        if idx == 0:
            break
        words.append(vocabulary[idx])
    return words


def Path2Sentence(file_path):
    """Read a text file and return its lines as a list of sentences (one per line)."""
    with open(file_path) as f:
        sentences = f.read().split('\n')
    # Drop the trailing empty string produced by the final newline.
    sentences = sentences[0:-1]
    return sentences


def BuildVocabulary(X):
    """Count word frequencies over the corpus and track the longest tokenized sentence."""
    max_sentence_len = 0
    word_frequency = collections.Counter()
    for line in X:
        words = nltk.word_tokenize(line.lower())
        if len(words) > max_sentence_len:
            max_sentence_len = len(words)
        for word in words:
            word_frequency[word] += 1
    return word_frequency, max_sentence_len


def Sentence2Index(X, word2index):
    """Convert each sentence into a list of word indices, mapping unknown words to "<UNK>"."""
    Xout = []
    for line in X:
        words = nltk.word_tokenize(line.lower())
        sequence = []
        for word in words:
            if word in word2index:
                sequence.append(word2index[word])
            else:
                sequence.append(word2index["<UNK>"])
        Xout.append(sequence)
    return Xout


def LoadPretrainedEmbeddings(file_path):
    """Load a pickled {word: vector} dict and return the embedding matrix as a NumPy array."""
    pretrained_fpath_saved = os.path.expanduser(file_path.format(sys.version_info.major))
    with open(pretrained_fpath_saved, 'rb') as f:
        embedding_weights = pickle.load(f)
    out = np.array(list(embedding_weights.values()))
    print('embedding_weights shape:', out.shape)
    return out
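

# Usage sketch (assumed entry point, not part of the original module): shows how the
# helpers above could fit together. The corpus path, vocabulary size, and the
# <PAD>/<UNK> index conventions are illustrative assumptions; only the functions
# defined in this file are taken from it. nltk.word_tokenize may require the
# 'punkt' tokenizer data (nltk.download('punkt')).
if __name__ == "__main__":
    # Hypothetical corpus file: one sentence per line.
    sentences = Path2Sentence("data/train.txt")

    # Build a word -> index mapping from the most frequent words.
    word_frequency, max_sentence_len = BuildVocabulary(sentences)
    word2index = {"<PAD>": 0, "<UNK>": 1}  # index 0 is padding, matching Idx2Word's stop condition
    for i, (word, _) in enumerate(word_frequency.most_common(5000)):
        word2index[word] = i + 2
    index2word = {idx: word for word, idx in word2index.items()}

    # Encode sentences as index sequences, then decode one back to words.
    X_idx = Sentence2Index(sentences, word2index)
    print("longest sentence:", max_sentence_len)
    print(Idx2Word(X_idx[0], index2word))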