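"""part1_fp.py

Train a bag-of-words logistic-regression sentiment classifier and analyze its
errors on a held-out validation fold: the most confident false positives and
false negatives, AUROC per review source (amazon/imdb/yelp), average sentence
length for correct vs. incorrect predictions, and AUROC on sentences with and
without negation words. Writes test-set probabilities to yproba1_test.txt.

Assumes load_data() returns (x_train_df, y_train_df, x_test_df), where the
feature frames have 'text' and 'website_name' columns and y_train_df has an
'is_positive_sentiment' column, as used below.
"""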
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from load_train_data import load_data


def analyze_predictions(model, X, y, n_examples=7):
    """Return indices and probabilities of the most confident misclassifications."""
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]
    false_positives = np.where((y_pred == 1) & (y == 0))[0]
    false_negatives = np.where((y_pred == 0) & (y == 1))[0]
    # Most confident errors: highest-probability false positives,
    # lowest-probability false negatives.
    fp_indices = false_positives[np.argsort(y_proba[false_positives])[-n_examples:]]
    fn_indices = false_negatives[np.argsort(y_proba[false_negatives])[:n_examples]]
    return fp_indices, fn_indices, y_proba[fp_indices], y_proba[fn_indices]


def plot_examples(texts, indices, probas, title):
    """Render the given examples as a text-only figure; the caller saves and closes it."""
    plt.figure(figsize=(8, len(indices) * 1))  # one unit of height per example
    for i, (idx, proba) in enumerate(zip(indices, probas)):
        text = texts[idx]
        # Space the examples evenly down the figure.
        plt.text(0.05, 1 - (i + 1) / (len(indices) + 1),
                 f"{text} (Probability: {proba:.3f})",
                 fontsize=9, wrap=True, va='top')
    plt.yticks([])
    plt.xticks([])
    plt.suptitle(title, fontsize=14, fontweight='bold', y=0.98)
    plt.tight_layout(rect=[0, 0, 1, 0.96])  # leave room for the title
    # No plt.show() here: showing the figure before savefig() can leave a
    # closed/blank figure, so saving is left to the caller.


def main():
    x_train_df, y_train_df, x_test_df = load_data()

    # Bag-of-words features; report vocabulary statistics on the full training set.
    vectorizer = CountVectorizer(lowercase=True, min_df=1, max_df=0.415, binary=False)
    x_train_bow = vectorizer.fit_transform(x_train_df['text'])
    x_test_bow = vectorizer.transform(x_test_df['text'])
    vocabulary_size = len(vectorizer.vocabulary_)
    print(f"Vocabulary size: {vocabulary_size}")
    total_words = x_train_bow.sum()
    print(f"Total number of words in the training data: {total_words}")

    y_train = y_train_df['is_positive_sentiment'].values
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(lowercase=True, min_df=1, max_df=0.415, binary=False)),
        ('classifier', LogisticRegression(C=2.154434690031882, solver='lbfgs',
                                          max_iter=5000, random_state=12))
    ])

    # Hold out one stratified fold for error analysis.
    cross_validation = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
    train_index, val_index = next(cross_validation.split(x_train_df['text'], y_train))
    X_train, X_val = x_train_df['text'].iloc[train_index], x_train_df['text'].iloc[val_index]
    y_train, y_val = y_train[train_index], y_train[val_index]
    texts_val = X_val.values
    websites_val = x_train_df['website_name'].iloc[val_index]

    pipeline.fit(X_train, y_train)
    # Analyze predictions: most confident false positives and false negatives.
    fp_indices, fn_indices, fp_probas, fn_probas = analyze_predictions(pipeline, X_val, y_val)

    # Plot examples, saving each figure before closing it.
    try:
        plot_examples(texts_val, fp_indices, fp_probas, "False Positives")
        plt.savefig('false_positives.png', dpi=300, bbox_inches='tight')
        plt.close()
        plot_examples(texts_val, fn_indices, fn_probas, "False Negatives")
        plt.savefig('false_negatives.png', dpi=300, bbox_inches='tight')
        plt.close()
    except Exception as e:
        print(f"Error occurred while plotting: {e}")
    # Analyze performance on each review source.
    y_proba_val = pipeline.predict_proba(X_val)[:, 1]
    websites = ['amazon', 'imdb', 'yelp']
    for website in websites:
        website_mask = (websites_val == website).values
        if np.sum(website_mask) > 0:
            auroc_website = roc_auc_score(y_val[website_mask], y_proba_val[website_mask])
            print(f"AUROC for {website} reviews: {auroc_website:.4f}")

    # Final prediction on the test set.
    y_proba_test = pipeline.predict_proba(x_test_df['text'])[:, 1]
    np.savetxt('yproba1_test.txt', y_proba_test)
    # Compare average sentence length for correct vs. incorrect predictions.
    sentence_lengths = np.array([len(text.split()) for text in texts_val])
    y_pred_val = pipeline.predict(X_val)
    correct_predictions = y_pred_val == y_val
    avg_length_correct = sentence_lengths[correct_predictions].mean()
    avg_length_incorrect = sentence_lengths[~correct_predictions].mean()
    print(f"Average sentence length for correct predictions: {avg_length_correct:.2f}")
    print(f"Average sentence length for incorrect predictions: {avg_length_incorrect:.2f}")
    # Compare performance on sentences with and without negation words.
    # Note: substring matching, so e.g. 'know' also matches 'no'.
    negation_words = ['not', 'no', 'never', "n't", 'without']
    has_negation = np.array([any(word in text.lower() for word in negation_words)
                             for text in texts_val])
    auroc_with_negation = roc_auc_score(
        y_val[has_negation], pipeline.predict_proba(X_val[has_negation])[:, 1])
    auroc_without_negation = roc_auc_score(
        y_val[~has_negation], pipeline.predict_proba(X_val[~has_negation])[:, 1])
    print(f"AUROC for sentences with negation: {auroc_with_negation:.4f}")
    print(f"AUROC for sentences without negation: {auroc_without_negation:.4f}")


if __name__ == '__main__':
    main()