bart_detection.py
import argparse
import random
import numpy as np
import pandas as pd
import csv
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer, BartModel
from multitask_classifier import split_csv
from sklearn.metrics import matthews_corrcoef
from optimizer import AdamW
from sophia import SophiaG
from datasets import preprocess_string
import costum_loss
from bart_generation import add_synonyms_to_dataframe, add_noise_to_sentence
TQDM_DISABLE = False
class BartWithClassifier(nn.Module):
def __init__(self, num_labels=7):
super(BartWithClassifier, self).__init__()
self.bart = BartModel.from_pretrained(
"facebook/bart-large", local_files_only=True
)
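        # One linear output per paraphrase type; a sigmoid is applied per output,
        # giving independent probabilities (multi-label classification).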
self.classifier = nn.Linear(self.bart.config.hidden_size, num_labels)
self.sigmoid = nn.Sigmoid()
def forward(self, input_ids, attention_mask=None):
# Use the BartModel to obtain the last hidden state
outputs = self.bart(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = outputs.last_hidden_state
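        # Use the hidden state of the first token as a pooled representation of the
        # sentence pair (analogous to a [CLS] token).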
cls_output = last_hidden_state[:, 0, :]
# Add an additional fully connected layer to obtain the logits
logits = self.classifier(cls_output)
# Return the probabilities
probabilities = self.sigmoid(logits)
return probabilities
def transform_data(
dataset,
batch_size,
shuffle,
max_length=512,
):
"""
dataset: pd.DataFrame
Turn the data to the format you want to use.
1. Extract the sentences from the dataset. We recommend using the already split
sentences in the dataset.
2. Use the AutoTokenizer from_pretrained to tokenize the sentences and obtain the
input_ids and attention_mask.
3. Currently, the labels are in the form of [2, 5, 6, 0, 0, 0, 0]. This means that
the sentence pair is of type 2, 5, and 6. Turn this into a binary form, where the
label becomes [0, 1, 0, 0, 1, 1, 0]. Be careful that the test-student.csv does not
have the paraphrase_types column. You should return a DataLoader without the labels.
4. Use the input_ids, attention_mask, and binary labels to create a TensorDataset.
Return a DataLoader with the TensorDataset. You can choose a batch size of your
choice.
"""
# raise NotImplementedError
tokenizer = AutoTokenizer.from_pretrained(
"facebook/bart-large", local_files_only=True
)
sentences1 = dataset["sentence1"].tolist()
sentences2 = dataset["sentence2"].tolist()
has_labels = "paraphrase_types" in dataset.columns
if has_labels:
labels = (
dataset["paraphrase_types"]
.apply(lambda x: list(map(int, x.strip("[]").split(", "))))
.tolist()
)
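        # Example: "[2, 5, 6, 0, 0, 0, 0]" becomes the 7-dim binary vector
        # [0, 1, 0, 0, 1, 1, 0] (position i-1 is set when type i is present).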
binary_labels = [
[1 if i in label else 0 for i in range(1, 8)] for label in labels
] # number of labels = 7
else:
binary_labels = None
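    # Tokenize each (sentence1, sentence2) pair as a single sequence; the
    # tokenizer inserts its separator tokens between the two sentences.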
encodings = tokenizer(
sentences1,
sentences2,
truncation=True,
padding=True,
max_length=max_length,
)
input_ids = torch.tensor(encodings["input_ids"])
attention_mask = torch.tensor(encodings["attention_mask"])
if binary_labels:
labels_tensors = torch.tensor(binary_labels)
dataset = TensorDataset(input_ids, attention_mask, labels_tensors)
else:
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
return dataloader, tokenizer
def train_model(model, train_data, dev_data, device, tokenizer, args):
"""
Train the model. You can use any training loop you want. We recommend starting with
AdamW as your optimizer. You can take a look at the SST training loop for reference.
Think about your loss function and the number of epochs you want to train for.
You can also use the evaluate_model function to evaluate the
model on the dev set. Print the training loss, training accuracy, and dev accuracy at
the end of each epoch.
Return the trained model.
"""
### TODO
# raise NotImplementedError
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=args.lr)
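    # Per-type positive-class weights to counter the label imbalance among the seven
    # paraphrase types (rarer types get larger weights); presumably consumed by
    # CustomLoss in the same way as the pos_weight argument of nn.BCEWithLogitsLoss.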
pos_weight = torch.tensor([3.78436018957346, 0.5, 3.717289719626168, 4.869186046511628, 2.83111954459203, 0.5, 0.36], device=device)
# [3.78436018957346, 0.2140709561034275, 3.717289719626168, 4.869186046511628, 2.83111954459203, 0.0039781203381402, 0.2310975609756098]
# loss_fun = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
loss_fun = costum_loss.CustomLoss(pos_weight=pos_weight)
# Run for the specified number of epochs
for epoch in range(args.epochs):
model.train()
train_loss = 0
num_batches = 0
total_examples = 0
correct_preds = 0
for batch in tqdm(train_data, desc=f"train-{epoch+1:02}", disable=TQDM_DISABLE):
# print(f"batch: {batch}")
b_ids, b_mask, b_labels = batch
b_ids = b_ids.to(device)
b_mask = b_mask.to(device)
b_labels = b_labels.to(device)
# Decode the input_ids to text, apply noise, then re-encode
original_sentences = tokenizer.batch_decode(b_ids, skip_special_tokens=True)
noisy_sentences = [add_noise_to_sentence(sentence, noise_level=args.noise, tokenizer=tokenizer) for sentence in original_sentences]
noisy_encodings = tokenizer(noisy_sentences, padding=True, truncation=True, max_length=b_ids.size(1), return_tensors="pt")
noisy_b_ids = noisy_encodings.input_ids.to(device)
noisy_b_mask = noisy_encodings.attention_mask.to(device)
optimizer.zero_grad()
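            # Note: the model's forward already applies a sigmoid, so these "logits"
            # are in fact per-type probabilities in [0, 1].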
logits = model(noisy_b_ids, noisy_b_mask)
loss = loss_fun(logits, b_labels.float())
loss.backward()
optimizer.step()
train_loss += loss.item()
num_batches += 1
total_examples += b_labels.size(0) * b_labels.size(
1
) # total number of examples per batch
preds = logits.round()
correct_preds += (preds == b_labels).sum().item()
avg_train_loss = train_loss / num_batches
train_accuracy = correct_preds / total_examples
dev_accuracy, matthews_coefficient = evaluate_model(model, dev_data, device)
print(
f"Epoch {epoch+1:02} | Train Loss: {avg_train_loss:.4f} | Train Accuracy: {train_accuracy:.4f} | Dev Accuracy: {dev_accuracy:.4f} | dev matthews_coefficient: {matthews_coefficient:.4f}"
)
return model
def test_model(model, test_data, test_ids, device):
"""
    Test the model. Predict the paraphrase types for the given sentences and return the results in the form of
    a Pandas DataFrame with the columns 'id' and 'Predicted_Paraphrase_Types'.
    The 'Predicted_Paraphrase_Types' column should contain the binary array of your model's predictions.
    Return this DataFrame.
"""
### TODO
# raise NotImplementedError
model.to(device)
model.eval()
all_preds = []
all_logits = []
with torch.no_grad():
for batch in tqdm(test_data, desc="test", disable=TQDM_DISABLE):
b_ids, b_mask = batch
b_ids = b_ids.to(device)
b_mask = b_mask.to(device)
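            # The model outputs sigmoid probabilities; rounding yields the 0/1
            # prediction for each of the seven paraphrase types.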
logits = model(b_ids, b_mask)
preds = logits.round().cpu().numpy()
all_preds.extend(preds)
logits = logits.cpu().numpy()
all_logits.extend(logits)
pred_paraphrase_types = [[int(x) for x in pred] for pred in all_preds]
logit_paraphrase_types = [[float(x) for x in logit] for logit in all_logits]
df_test_results = pd.DataFrame(
{"id": test_ids, "Predicted_Paraphrase_Types": pred_paraphrase_types, "logits": logit_paraphrase_types}
)
return df_test_results
def evaluate_model(model, test_data, device):
"""
    This function measures the accuracy of our model's predictions on a given train/validation set.
    We measure how many of the seven paraphrase types the model has predicted correctly for each data point.
    So, if the model's prediction is [1,1,0,0,1,1,0] and the true label is [0,0,0,0,1,1,0], this prediction
    has an accuracy of 5/7, i.e. 71.4%.
"""
all_pred = []
all_labels = []
    model.eval()  # switch to eval mode; turns off randomness like dropout
with torch.no_grad():
for batch in test_data:
input_ids, attention_mask, labels = batch
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
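            # Threshold the sigmoid probabilities at 0.5 to obtain binary per-type predictions.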
predicted_labels = (outputs > 0.5).int()
all_pred.append(predicted_labels)
all_labels.append(labels)
all_predictions = torch.cat(all_pred, dim=0)
all_true_labels = torch.cat(all_labels, dim=0)
true_labels_np = all_true_labels.cpu().numpy()
predicted_labels_np = all_predictions.cpu().numpy()
# Compute the accuracy for each label
accuracies = []
matthews_coefficients = []
for label_idx in range(true_labels_np.shape[1]):
correct_predictions = np.sum(
true_labels_np[:, label_idx] == predicted_labels_np[:, label_idx]
)
total_predictions = true_labels_np.shape[0]
label_accuracy = correct_predictions / total_predictions
accuracies.append(label_accuracy)
        # compute the Matthews Correlation Coefficient for each paraphrase type
matth_coef = matthews_corrcoef(true_labels_np[:,label_idx], predicted_labels_np[:,label_idx])
matthews_coefficients.append(matth_coef)
# Calculate the average accuracy over all labels
accuracy = np.mean(accuracies)
matthews_coefficient = np.mean(matthews_coefficients)
model.train()
return accuracy, matthews_coefficient
def seed_everything(seed=11711):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=11711)
parser.add_argument("--use_gpu", action="store_true")
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--epochs", type=int, default=1)
parser.add_argument("--lr", type=float, default=1e-5)
# parser.add_argument("--lr_steps", type=int, default=10)
parser.add_argument(
"--etpc_train", type=str, default="data/etpc-paraphrase-train.csv"
)
parser.add_argument("--etpc_dev", type=str, default="data/etpc-paraphrase-dev.csv")
parser.add_argument(
"--etpc_test",
type=str,
default="data/etpc-paraphrase-detection-test-student.csv",
)
parser.add_argument("--type_2", type=float, default=0.3)
parser.add_argument("--type_6", type=float, default=0.3)
parser.add_argument("--type_7", type=float, default=0.3)
parser.add_argument("--noise", type=float, default=0.)
parser.add_argument("--synonym_prob", type=float, default=0.)
args = parser.parse_args()
return args
def finetune_paraphrase_detection(args):
model = BartWithClassifier()
device = torch.device("cuda") if args.use_gpu else torch.device("cpu")
model.to(device)
train_dataset = pd.read_csv(
args.etpc_train,
sep="\t",
usecols=["sentence1", "sentence2", "paraphrase_types"],
)
dev_dataset = pd.read_csv(
args.etpc_dev,
sep="\t",
usecols=["sentence1", "sentence2", "paraphrase_types"],
)
test_dataset = pd.read_csv(
args.etpc_test,
sep="\t",
usecols=["id", "sentence1", "sentence2"],
)
    # A train/validation split is not done here: the data were already split
    # into separate train and dev CSV files beforehand.
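    # Optional augmentation: add_synonyms_to_dataframe (from bart_generation)
    # presumably replaces words in the training sentences with synonyms with
    # probability --synonym_prob.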
train_dataset = add_synonyms_to_dataframe(train_dataset, prob=args.synonym_prob)
train_data, tokenizer = transform_data(train_dataset, args.batch_size, shuffle=True)
dev_data, _ = transform_data(dev_dataset, args.batch_size, shuffle=False)
test_data, _ = transform_data(test_dataset, args.batch_size, shuffle=False)
model = train_model(model, train_data, dev_data, device, tokenizer, args)
print("Training finished.")
accuracy, matthews_corr = evaluate_model(model, train_data, device)
print(f"The accuracy of the train_data is: {accuracy:.3f}")
print(f"Matthews Correlation Coefficient of the train_data is: {matthews_corr:.3f}")
test_ids = test_dataset["id"]
test_results = test_model(model, test_data, test_ids, device)
test_results.to_csv(
"predictions/bart/etpc-paraphrase-detection-test-output.csv",
index=False,
sep="\t",
)
if __name__ == "__main__":
args = get_args()
seed_everything(args.seed)
finetune_paraphrase_detection(args)