explore_we_model.py
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import fastText
import pdb
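# Load a fastText binary model ('hr') from the sibling fastText directory.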
model_name = 'hr'
path = '../fastText/{}.bin'.format(model_name)
model = fastText.load_model(path)
vocab = model.get_words()
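# Pick a query word and check that it appears in the model's vocabulary.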
word = 'sexual_assault'
if word in vocab:
    print('word {} in vocab'.format(word))
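# Query a vector for each subword string returned by get_subwords and average them;
# optionally plot the per-subword vectors as a heatmap.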
def average_subwords(word, plot=0):
    subwords = model.get_subwords(word)[0]
    representations = []
    for subword in subwords:
        representations.append(model.get_word_vector(subword))
    matrix = np.array(representations)
    if plot:
        df = pd.DataFrame(np.swapaxes(matrix, 0, 1), columns=subwords)
        plot_heatmap(df)
    return np.average(matrix, axis=0)
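# Render a DataFrame as a seaborn heatmap.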
def plot_heatmap(df, num=1):
    f, ax = plt.subplots(num)
    sns.heatmap(df, ax=ax)
    plt.show()
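# Look up a word's row in the given matrix via its fastText word id.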
def query_word(word, matrix, model):
    idx = int(model.get_word_id(word))
    return matrix[idx]
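# Subword-averaged vector, the model's own word vector, and the output matrix.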
avg_vec = average_subwords(word, 1)
vec = model.get_word_vector(word)
matrix = model.get_output_matrix()
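# Standardize the output matrix and project it onto its first 5 principal components.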
pca = decomposition.PCA(n_components=5)
x_std = StandardScaler().fit_transform(matrix)
pca_matrix = pca.fit_transform(x_std)
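# Plot heatmaps of a few keywords' vectors in both the raw and PCA-reduced output matrix.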
keywords = ['rape', 'raped', 'stepfather', 'robbery']
for mat in (matrix, pca_matrix):
    representations = [query_word(kw, mat, model) for kw in keywords]
    df = pd.DataFrame(np.swapaxes(np.array(representations), 0, 1), columns=keywords)
    plot_heatmap(df)
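# Print the first 10 components of each vector and drop into the debugger for inspection.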
print(avg_vec[:10])
print(vec[:10])
pdb.set_trace()