-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8842edb
Showing
18 changed files
with
6,411 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
from nltk.corpus.reader.wordlist import WordListCorpusReader | ||
from nltk.tokenize import word_tokenize | ||
from nltk.corpus import stopwords | ||
from nltk.stem import WordNetLemmatizer | ||
from math import log | ||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
def process_mails(mail, lower_case=True, lemma=True, stem=True, stop_words=True, gram=2):
    """Turn a raw mail body into the list of features used by the classifier.

    Args:
        mail: The message text.
        lower_case: Lower-case the text before tokenising.
        lemma: Lemmatise tokens with WordNet (unigram mode only).
        stem: Accepted for interface compatibility; this function does not
            perform any stemming.
        stop_words: Remove English stop words (unigram mode only).
        gram: Size of the word n-grams to build. With ``gram > 1`` the
            function returns the n-grams straight away, so stop-word removal
            and lemmatisation are applied only when ``gram <= 1``.

    Returns:
        A list of space-joined n-grams when ``gram > 1``, otherwise the list
        of (optionally filtered and lemmatised) tokens.
    """
    if lower_case:
        mail = mail.lower()
    # Tokens of one or two characters are discarded.
    words = [token for token in word_tokenize(mail) if len(token) > 2]
    if gram > 1:
        # A message shorter than `gram` tokens yields an empty range and
        # therefore an empty feature list.
        return [' '.join(words[i:i + gram]) for i in range(len(words) - gram + 1)]
    if stop_words:
        # A set gives O(1) membership tests instead of scanning the list
        # once per token.
        sw = set(stopwords.words('english'))
        words = [word for word in words if word not in sw]
    if lemma:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    return words
|
||
|
||
class SpamClassifier(object):
    """Naive Bayes spam/ham classifier using TF-IDF weighted word likelihoods.

    Typical use: construct with the training frame, call ``train()``, then
    ``predict()`` on an iterable of raw messages.
    """

    def __init__(self, trainData):
        """Keep the ``'message'`` and ``'label'`` columns of ``trainData``.

        Labels are treated as truthy for spam and falsy for ham.
        """
        self.mail = trainData['message']
        self.label = trainData['label']

    def train(self):
        """Collect term/document frequencies, then derive the probabilities."""
        self.calc_TF_and_IDF()
        self.calc_TF_IDF()

    def calc_TF_and_IDF(self):
        """Count term and document frequencies per class.

        Populates ``tf_spam``/``tf_ham`` (occurrences of each feature),
        ``idf_spam``/``idf_ham`` (number of messages containing each
        feature), ``spam_words``/``ham_words`` (total features seen) and the
        per-class message counts.
        """
        label_counts = self.label.value_counts()
        self.spam_mails = label_counts[1]  # no of spam mails
        self.ham_mails = label_counts[0]  # no of ham mails
        self.total_mails = self.spam_mails + self.ham_mails
        self.spam_words = 0
        self.ham_words = 0
        self.tf_spam = dict()
        self.tf_ham = dict()
        self.idf_spam = dict()
        self.idf_ham = dict()
        # Pair messages and labels positionally so the result does not
        # depend on the Series carrying a 0..n-1 index.
        for mail, is_spam in zip(self.mail, self.label):
            mail_processed = process_mails(mail)
            if is_spam:
                tf, idf = self.tf_spam, self.idf_spam
                self.spam_words += len(mail_processed)
            else:
                tf, idf = self.tf_ham, self.idf_ham
                self.ham_words += len(mail_processed)
            for word in mail_processed:
                tf[word] = tf.get(word, 0) + 1
            # Each distinct feature counts once per message towards the
            # document frequency.
            for word in set(mail_processed):
                idf[word] = idf.get(word, 0) + 1

    def calc_TF_IDF(self):
        """Convert the raw counts into smoothed TF-IDF likelihoods and priors.

        For every feature the weight is ``tf * log(N / df)``, where ``N`` is
        the number of training messages and ``df`` the number of messages
        (of either class) containing the feature. The weights are then
        add-one smoothed and normalised per class.
        """
        self.prob_spam = dict()
        self.prob_ham = dict()
        self.sum_tf_idf_spam = 0
        self.sum_tf_idf_ham = 0
        n_mails = self.spam_mails + self.ham_mails

        for word, tf in self.tf_spam.items():
            doc_freq = self.idf_spam[word] + self.idf_ham.get(word, 0)
            self.prob_spam[word] = tf * log(n_mails / doc_freq)
            self.sum_tf_idf_spam += self.prob_spam[word]
        spam_norm = self.sum_tf_idf_spam + len(self.prob_spam)
        for word in self.prob_spam:
            self.prob_spam[word] = (self.prob_spam[word] + 1) / spam_norm

        for word, tf in self.tf_ham.items():
            doc_freq = self.idf_spam.get(word, 0) + self.idf_ham[word]
            self.prob_ham[word] = tf * log(n_mails / doc_freq)
            self.sum_tf_idf_ham += self.prob_ham[word]
        ham_norm = self.sum_tf_idf_ham + len(self.prob_ham)
        for word in self.prob_ham:
            self.prob_ham[word] = (self.prob_ham[word] + 1) / ham_norm

        self.prob_spam_mail = self.spam_mails / self.total_mails
        self.prob_ham_mail = self.ham_mails / self.total_mails

    def classify(self, processed_mail):
        """Return True when ``processed_mail`` scores at least as spam as ham.

        ``processed_mail`` is the feature list for one message. Scores are
        log-probabilities; a feature unseen for a class contributes the
        smoothed likelihood ``1 / (sum_tf_idf + vocabulary size)``.
        """
        pSpam = 0
        pHam = 0
        for word in processed_mail:
            if word in self.prob_spam:
                pSpam += log(self.prob_spam[word])
            else:
                pSpam -= log(self.sum_tf_idf_spam + len(self.prob_spam))
            if word in self.prob_ham:
                pHam += log(self.prob_ham[word])
            else:
                pHam -= log(self.sum_tf_idf_ham + len(self.prob_ham))
        # The class prior is applied exactly once per message.
        pSpam += log(self.prob_spam_mail)
        pHam += log(self.prob_ham_mail)
        return pSpam >= pHam

    def predict(self, testData):
        """Classify every raw message yielded by iterating ``testData``.

        Returns:
            A dict mapping the position of each message to 1 (spam) or
            0 (ham).
        """
        return {
            i: int(self.classify(process_mails(mail)))
            for i, mail in enumerate(testData)
        }
|
||
|
||
def metrics(labels, predictions):
    """Print precision, recall, F-score and accuracy for binary predictions.

    Args:
        labels: Ground-truth values, indexable by ``0..len(labels)-1``;
            1 marks the positive class and 0 the negative class.
        predictions: Predicted values, indexable by the same positions
            (a list or a dict keyed by position both work).

    Any ratio whose denominator is zero (for example precision when nothing
    was predicted positive) is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    # Positional indexing (rather than zip) is deliberate: iterating a dict
    # of predictions would yield its keys, not the predicted values.
    for i in range(len(labels)):
        actual, predicted = labels[i], predictions[i]
        true_pos += int(actual == 1 and predicted == 1)
        true_neg += int(actual == 0 and predicted == 0)
        false_pos += int(actual == 0 and predicted == 1)
        false_neg += int(actual == 1 and predicted == 0)

    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    Fscore = _ratio(2 * precision * recall, precision + recall)
    accuracy = _ratio(true_pos + true_neg,
                      true_pos + true_neg + false_pos + false_neg)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", Fscore)
    print("Accuracy: ", accuracy)
|
||
|
||
# Load the raw data set and reduce it to two columns: the message text and
# a numeric label (0 = ham, 1 = spam).
mails = pd.read_csv('spam.csv', encoding='latin-1')
mails = mails.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
mails = mails.rename(columns={'v1': 'labels', 'v2': 'message'})
mails['label'] = mails['labels'].map({'ham': 0, 'spam': 1})
mails = mails.drop(columns=['labels'])
# NOTE(review): not referenced anywhere in the visible code; kept in case
# another module reads it.
totalMails = 5587

# Random split: each row independently lands in the training set with
# probability 0.75, otherwise in the test set.
trainIndex, testIndex = [], []
for i in range(mails.shape[0]):
    (trainIndex if np.random.uniform(0, 1) < 0.75 else testIndex).append(i)

# Renumber both subsets from 0 and discard the original row labels.
trainData = mails.loc[trainIndex].reset_index(drop=True)
testData = mails.loc[testIndex].reset_index(drop=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
*{ | ||
font-weight: 300; | ||
} | ||
nav, nav a, nav a:hover, nav a:active{ | ||
background-color: #eee2dc; | ||
color: black; | ||
} | ||
nav a:hover{ | ||
font-weight: 400; | ||
} | ||
nav p{ | ||
margin: 0; | ||
pointer-events: none; | ||
} | ||
.nav-content{ | ||
width: 90%; | ||
margin: auto; | ||
} | ||
|
||
.user-name{ | ||
margin: 1%; | ||
} | ||
h5,h6,h4{ | ||
font-weight: 200; | ||
} | ||
.logout-button{ | ||
background-color: white; | ||
color: #ac3b61; | ||
border: 0.5px solid #ac3b61; | ||
padding: 0.5% 1%; | ||
} | ||
.logout-button:hover{ | ||
background-color: #ac3b61; | ||
color: white; | ||
} | ||
.home-card{ | ||
background-color: #eee2dc; | ||
border: 0; | ||
border-radius: 0; | ||
margin: 2%; | ||
color: #123c69; | ||
} | ||
.card-header{ | ||
background-color: #edc7b7; | ||
color: #ac3b61; | ||
} | ||
.form{ | ||
padding: 2%; | ||
} | ||
.form-button{ | ||
margin: 2%; | ||
margin-bottom: 0; | ||
padding:1% 2%; | ||
background-color: #ac3b61; | ||
border: 0; | ||
color: white; | ||
} | ||
.form-button:hover{ | ||
background-color: white; | ||
color: #ac3b61; | ||
border: 0.5px solid #ac3b61; | ||
} | ||
textarea{ | ||
resize: none; | ||
} | ||
.form-control{ | ||
border: 0; | ||
border-radius: 0; | ||
font-weight: 100; | ||
} | ||
.row{ | ||
margin-top: 2%; | ||
} | ||
.inbox-button{
    padding: 2%;
    margin: 2%;
    background-color: white;
    color: #ac3b61;
    /* Single border declaration: the button renders without a border. */
    border: 0;
}
.inbox-button:hover { | ||
background-color: #ac3b61; | ||
color: white; | ||
} | ||
.result_display{ | ||
font-weight: 200; | ||
margin: 2% 2% 0 2%; | ||
font-size: 1.25em; | ||
} | ||
.logout-card{ | ||
background-color: #eee2dc; | ||
border: 0; | ||
border-radius: 0; | ||
margin: 10% 20%; | ||
color: #123c69; | ||
} | ||
.check-form{ | ||
margin: 4%; | ||
} | ||
.sent-mails{ | ||
margin: 4%; | ||
} | ||
.link-dark:hover{ | ||
color: #ac3b61; | ||
} | ||
.showPassword{ | ||
text-align: left ; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
<!doctype html> | ||
<html lang="en"> | ||
|
||
<head> | ||
<!-- Required meta tags --> | ||
<meta charset="utf-8"> | ||
<meta name="viewport" content="width=device-width, initial-scale=1"> | ||
|
||
<!-- Bootstrap CSS --> | ||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/css/bootstrap.min.css" rel="stylesheet" | ||
integrity="sha384-eOJMYsd53ii+scO/bJGFsiCZc+5NDVN2yr8+0RDqr0Ql0h+rP48ckxlpbzKgwra6" crossorigin="anonymous"> | ||
<link href="../static/dist/css/home.css" rel="stylesheet"> | ||
|
||
<title>MailBox</title> | ||
</head> | ||
|
||
<body class="home"> | ||
<nav class="navbar navbar-expand-lg"> | ||
<div class="container-fluid nav-content"> | ||
<p class="navbar-brand">MailBox</p> | ||
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNavAltMarkup" | ||
aria-controls="navbarNavAltMarkup" aria-expanded="false" aria-label="Toggle navigation"> | ||
<span class="navbar-toggler-icon"></span> | ||
</button> | ||
<div class="collapse navbar-collapse" id="navbarNavAltMarkup"> | ||
<div class="navbar-nav"> | ||
<a class="nav-link" aria-current="page" href="/home">New Mail</a> | ||
<a class="nav-link" aria-current="page" href="/inbox">Inbox</a> | ||
<a class="nav-link" href="/check_login">Check Mail</a> | ||
<a class="nav-link" href="/sentMails">Sent Mails</a> | ||
</div> | ||
</div> | ||
<h6 class="user-name">{{user}}</h6> | ||
<button onclick="location.href='/logout'" class="logout-button">Logout</button> | ||
</div> | ||
</nav> | ||
{% block home %} | ||
{% endblock %} | ||
{% block inbox %} | ||
{% endblock inbox %} | ||
{% block content %} | ||
{% endblock %} | ||
{% block sentMails %} | ||
{% endblock %} | ||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/js/bootstrap.bundle.min.js" | ||
integrity="sha384-JEW9xMcG8R+pH31jmWH6WWP0WintQrMb4s7ZOdauHnUtxwoG2vI5DkLtS3qm9Ekf" | ||
crossorigin="anonymous"></script> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
<!doctype html> | ||
<html lang="en"> | ||
|
||
<head> | ||
<!-- Required meta tags --> | ||
<meta charset="utf-8"> | ||
<meta name="viewport" content="width=device-width, initial-scale=1"> | ||
|
||
<!-- Bootstrap CSS --> | ||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/css/bootstrap.min.css" rel="stylesheet" | ||
integrity="sha384-eOJMYsd53ii+scO/bJGFsiCZc+5NDVN2yr8+0RDqr0Ql0h+rP48ckxlpbzKgwra6" crossorigin="anonymous"> | ||
<link href="../static/dist/css/home.css" rel="stylesheet"> | ||
|
||
<title>MailBox</title> | ||
</head> | ||
{% block content %} | ||
<body> | ||
<nav class="navbar navbar-expand-lg"> | ||
<div class="container-fluid nav-content"> | ||
<p class="navbar-brand">MailBox</p> | ||
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNavAltMarkup" | ||
aria-controls="navbarNavAltMarkup" aria-expanded="false" aria-label="Toggle navigation"> | ||
<span class="navbar-toggler-icon"></span> | ||
</button> | ||
<div class="collapse navbar-collapse" id="navbarNavAltMarkup"> | ||
<div class="navbar-nav"> | ||
<a class="nav-link active" aria-current="page" href="/home">New Mail</a> | ||
<a class="nav-link" href="/check">Check Mail</a> | ||
<a class="nav-link" href="/sentMails">Sent Mails</a> | ||
</div> | ||
</div> | ||
<button onclick="location.href='/login'" class="logout-button">Login</button> | ||
</div> | ||
</nav> | ||
<div class="container"> | ||
<div class="card home-card"> | ||
<div class="card-header text-center"><h5>Check your message</h5></div> | ||
<form method="POST" action="/result" class="form check-form"> | ||
<textarea name="message" rows="10" placeholder="Enter your message.." class="form-control">{{msg}}</textarea> | ||
<div class="text-center"> | ||
<button class="form-button" type="submit">Check</button> | ||
</div> | ||
{% block body %} | ||
{% endblock %} | ||
</form> | ||
|
||
</div> | ||
</div> | ||
</body> | ||
|
||
{% endblock content %} | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{% extends 'base.html' %}
{# Message-check page. The doctype and page chrome come from base.html;
   `extends` must be the first tag, because anything placed before it is
   emitted ahead of the parent template's output. #}

{% block content %}
<div class="container">
    <div class="card home-card">
        <div class="card-header text-center"><h5>Check your message</h5></div>
        <form method="POST" action="/result" class="form check-form">
            <textarea name="message" rows="10" placeholder="Enter your message.."
                class="form-control">{{msg}}</textarea>
            <div class="text-center">
                <button type="submit" class="form-button">Check</button>
            </div>
            {# Hook for child templates to render content below the button. #}
            {% block body %}
            {% endblock %}
        </form>
    </div>
</div>
{% endblock content %}
Oops, something went wrong.