-
Notifications
You must be signed in to change notification settings - Fork 0
/
baseline.py
121 lines (94 loc) · 4.55 KB
/
baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D, Average
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Flatten
from keras.models import Model
from keras.utils import plot_model
from keras.callbacks import TensorBoard
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# ---- Input files -------------------------------------------------------
TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE = 'test.csv'
EMBEDDING_FILE = 'glove.6B.300d.txt'  # alternative: glove.twitter.27B.25d.txt

# ---- Hyper-parameters --------------------------------------------------
embed_size = 300  # length of each word vector
max_features = 20000  # number of unique words to use (rows of the embedding matrix)
maxlen = 150  # maximum number of words kept per comment
num = 'test'  # tag substituted into the './keras_model/model{}/' output paths

# Label columns, in the order used for the target matrix.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# ---- Raw data ----------------------------------------------------------
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Comment texts as plain Python lists; labels as one array with a column per class.
sentences_train = train["comment_text"].tolist()
y_t = train[list_classes].to_numpy()
sentences_test = test["comment_text"].tolist()
def data_prepro(x_input):
    """Normalize raw comments to lowercase, alphanumeric, single-spaced text.

    Each entry is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (so ``"don't"`` becomes ``"dont"``), and each run of
    whitespace is collapsed to one space with leading/trailing space removed.

    Args:
        x_input: Iterable of comments. Entries are normally ``str``; ``None``
            and float NaN (a missing CSV cell) are treated as empty text, and
            any other non-string value is converted with ``str()``.

    Returns:
        A list of cleaned strings, one per input entry, in the same order.
    """
    whitelist = set('abcdefghijklmnopqrstuvwxyz 1234567890')
    x_output = []
    for m in x_input:
        if m is None or (isinstance(m, float) and m != m):
            # Missing value (None / NaN): there is no text to clean.
            m = ''
        elif not isinstance(m, str):
            m = str(m)
        # Keep every whitespace character (newline, tab, ...) at this stage so
        # that words separated only by a line break are not glued together;
        # the split/join below turns each whitespace run into a single space.
        all_text = ''.join(
            ch for ch in m.lower() if ch in whitelist or ch.isspace()
        )
        x_output.append(' '.join(all_text.split()))
    return x_output
# Clean the train and test comments with the same normalisation (train first).
list_sentences_train, list_sentences_test = (
    data_prepro(raw_comments)
    for raw_comments in (sentences_train, sentences_test)
)

# The vocabulary is fitted on the training comments only; the test comments
# are encoded with that same tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train)

# Text -> list of integer word ids.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(cleaned_comments)
    for cleaned_comments in (list_sentences_train, list_sentences_test)
)

# Bring every id sequence to length `maxlen` so they stack into one 2-D array.
X_t, X_test = (
    pad_sequences(id_sequences, maxlen=maxlen)
    for id_sequences in (list_tokenized_train, list_tokenized_test)
)
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``. The empty
            default cannot be unpacked, so a two-item value must be supplied
            (otherwise a ``ValueError`` is raised).
        interval: The score is computed whenever the epoch index passed to
            ``on_epoch_end`` is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super() so that Callback.__init__
        # itself runs; super(Callback, self) would skip it and leave the base
        # class uninitialised.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the resulting ROC-AUC.

        Args:
            epoch: Epoch index supplied by Keras; printed as ``epoch + 1``.
            logs: Metrics dict supplied by Keras; not used here.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 vector.

    Args:
        word: The token; returned unchanged.
        *arr: The coefficients (e.g. the numeric fields of one embedding-file
            line, still as strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        holding the values of ``arr`` in order.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector
# Read the pre-trained vectors into {word: float32 vector}. The file is opened
# in a `with` block so the handle is closed, and decoded as UTF-8 explicitly
# rather than with the platform default encoding.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for o in embedding_file:
        fields = o.split()
        if not fields:
            # Blank line: no token to register.
            continue
        embedding_word, embedding_coefs = get_coefs(*fields)
        embeddings_index[embedding_word] = embedding_coefs

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# One row per possible token id. The matrix is handed to
# Embedding(max_features, ...), so it needs exactly `max_features` rows even
# when the fitted vocabulary is smaller; sizing it by `nb_words` would also
# make the largest word id fall outside the matrix in that case.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        # Id beyond the vocabulary cap: no row for it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a pre-trained vector keep their all-zero row.
        embedding_matrix[i] = embedding_vector
# ---- Network -----------------------------------------------------------
# Frozen pre-trained embedding -> bidirectional LSTM -> max over time steps
# -> two ReLU dense layers (dropout in between) -> 6 sigmoid outputs.
inp = Input(shape=(maxlen,))
embedded = Embedding(
    max_features,
    embed_size,
    weights=[embedding_matrix],
    trainable=False,
)(inp)
encoded = Bidirectional(
    LSTM(
        64,
        return_sequences=True,
        return_state=False,
        dropout=0.1,
        recurrent_dropout=0.1,
    )
)(embedded)
pooled = GlobalMaxPool1D()(encoded)
hidden = Dense(64, activation="relu")(pooled)
hidden = Dropout(0.1)(hidden)
hidden = Dense(32, activation="relu")(hidden)
x = Dense(6, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# ---- Training ----------------------------------------------------------
tensorboard = TensorBoard(
    log_dir='./keras_model/model{}/logs'.format(num),
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

# 95% of the rows are used for fitting; the remaining rows are only seen by
# the ROC-AUC callback.
X_tra, X_val, y_tra, y_val = train_test_split(
    X_t, y_t, train_size=0.95, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
model.fit(
    X_tra,
    y_tra,
    batch_size=32,
    epochs=2,
    callbacks=[tensorboard, RocAuc],
)

# ---- Persist weights and architecture diagram --------------------------
MODEL_PATH = './keras_model/model{}/'.format(num)

model.save_weights(MODEL_PATH+'model.h5')
print("Saved weights to disk %s" % MODEL_PATH)

plot_model(model, to_file=MODEL_PATH+'graph.png')
print("Saved graph to disk %s" % MODEL_PATH)
# test
# Label columns, shared by the label file and the submission file.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Labels for the test set; `y_test` is loaded here but not used below.
df_test_label = pd.read_csv('test_labels.csv')
df_y1 = df_test_label[list_classes]
y_test = df_y1.to_numpy()

# One row of six sigmoid outputs per test comment.
y_pred = model.predict(x=X_test, batch_size=64, verbose=1)

# Overwrite the label columns of the sample submission with the predictions
# and write the result without the index column.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission[list_classes] = y_pred
sample_submission.to_csv('submission.csv', index=False)