Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
kshashikumar committed Nov 18, 2021
0 parents commit 8842edb
Show file tree
Hide file tree
Showing 18 changed files with 6,411 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
5,588 changes: 5,588 additions & 0 deletions MajorProject/spam.csv

Large diffs are not rendered by default.

145 changes: 145 additions & 0 deletions MajorProject/spamFilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from math import log
import pandas as pd
import numpy as np


def process_mails(mail, lower_case=True, lemma=True, stem=True, stop_words=True, gram=2):
    """Convert a raw message into the list of features used by the classifier.

    Args:
        mail: Message text.
        lower_case: Lower-case the text before tokenizing.
        lemma: Lemmatize each word with WordNet (single-word mode only).
        stem: Accepted for interface compatibility; not used by this function.
        stop_words: Remove English stop words (single-word mode only).
        gram: Size of the word n-grams. When greater than 1 the n-grams are
            returned directly and the stop-word and lemma options are not
            applied.

    Returns:
        A list of strings: space-joined n-grams when ``gram > 1``, otherwise
        the filtered (and optionally lemmatized) words.
    """
    if lower_case:
        mail = mail.lower()
    # Tokens of one or two characters are discarded before any other step.
    words = [token for token in word_tokenize(mail) if len(token) > 2]
    if gram > 1:
        # n-gram mode returns immediately; the steps below never run for it.
        return [' '.join(words[i:i + gram]) for i in range(len(words) - gram + 1)]
    if stop_words:
        # Build the lookup once as a set so each membership test is O(1).
        sw = set(stopwords.words('english'))
        words = [word for word in words if word not in sw]
    if lemma:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    return words


class SpamClassifier(object):
    """Naive Bayes style spam classifier using smoothed TF-IDF term weights.

    ``trainData`` must expose a ``'message'`` column with the raw texts and a
    ``'label'`` column in which a truthy value (1) marks spam and 0 marks ham.
    Both columns are read with the integer keys ``0 .. n-1``.
    """

    def __init__(self, trainData):
        self.mail = trainData['message']
        self.label = trainData['label']

    def train(self):
        """Collect term statistics, then derive the per-term probabilities."""
        self.calc_TF_and_IDF()
        self.calc_TF_IDF()

    def calc_TF_and_IDF(self):
        """Count term and document frequencies separately for spam and ham.

        Populates ``tf_spam``/``tf_ham`` (occurrences of each term),
        ``idf_spam``/``idf_ham`` (number of messages containing each term),
        ``spam_words``/``ham_words`` (total terms seen per class) and the
        message counts ``spam_mails``, ``ham_mails`` and ``total_mails``.
        """
        noOfMessages = self.mail.shape[0]  # no of messages in trainData
        label_counts = self.label.value_counts()
        self.spam_mails = label_counts[1]  # no of spam mails
        self.ham_mails = label_counts[0]  # no of ham mails
        self.total_mails = self.spam_mails + self.ham_mails
        self.spam_words = 0
        self.ham_words = 0
        self.tf_spam = dict()
        self.tf_ham = dict()
        self.idf_spam = dict()
        self.idf_ham = dict()
        for i in range(noOfMessages):
            mail_processed = process_mails(self.mail[i])
            if self.label[i]:
                term_freq, doc_freq = self.tf_spam, self.idf_spam
                self.spam_words += len(mail_processed)
            else:
                term_freq, doc_freq = self.tf_ham, self.idf_ham
                # The running ham total must be stored back on the instance.
                self.ham_words += len(mail_processed)
            for word in mail_processed:
                term_freq[word] = term_freq.get(word, 0) + 1
            # Each message contributes at most once to a term's document count.
            for word in set(mail_processed):
                doc_freq[word] = doc_freq.get(word, 0) + 1

    def calc_TF_IDF(self):
        """Turn the raw counts into smoothed TF-IDF probabilities per class.

        For every term the weight is ``tf * log(N / df)`` where ``N`` is the
        number of training messages and ``df`` the number of messages (spam
        plus ham) containing the term. Weights are normalised with add-one
        smoothing over the class vocabulary. Also sets the class priors
        ``prob_spam_mail`` and ``prob_ham_mail``.
        """
        self.prob_spam = dict()
        self.prob_ham = dict()
        self.sum_tf_idf_spam = 0
        self.sum_tf_idf_ham = 0

        for word in self.tf_spam:
            containing = self.idf_spam[word] + self.idf_ham.get(word, 0)
            self.prob_spam[word] = self.tf_spam[word] * log((self.spam_mails + self.ham_mails) / containing)
            self.sum_tf_idf_spam += self.prob_spam[word]
        spam_denominator = self.sum_tf_idf_spam + len(self.prob_spam)
        for word in self.tf_spam:
            self.prob_spam[word] = (self.prob_spam[word] + 1) / spam_denominator

        for word in self.tf_ham:
            containing = self.idf_spam.get(word, 0) + self.idf_ham[word]
            self.prob_ham[word] = self.tf_ham[word] * log((self.spam_mails + self.ham_mails) / containing)
            self.sum_tf_idf_ham += self.prob_ham[word]
        ham_denominator = self.sum_tf_idf_ham + len(self.prob_ham)
        for word in self.tf_ham:
            self.prob_ham[word] = (self.prob_ham[word] + 1) / ham_denominator

        self.prob_spam_mail = self.spam_mails / self.total_mails
        self.prob_ham_mail = self.ham_mails / self.total_mails

    def classify(self, processed_mail):
        """Return True when the processed message scores at least as spam as ham.

        Args:
            processed_mail: Iterable of terms as produced by ``process_mails``.

        Returns:
            ``True`` if the spam log-score is greater than or equal to the ham
            log-score (ties count as spam), otherwise ``False``.
        """
        pSpam = 0
        pHam = 0
        for word in processed_mail:
            if word in self.prob_spam:
                pSpam += log(self.prob_spam[word])
            else:
                # Unseen term: smoothed weight 1 / (sum + vocabulary size).
                pSpam -= log(self.sum_tf_idf_spam + len(self.prob_spam))
            if word in self.prob_ham:
                pHam += log(self.prob_ham[word])
            else:
                pHam -= log(self.sum_tf_idf_ham + len(self.prob_ham))
        pSpam += log(self.prob_spam_mail)
        pHam += log(self.prob_ham_mail)
        return pSpam >= pHam

    def predict(self, testData):
        """Classify every message in ``testData``.

        Args:
            testData: Iterable of raw message strings.

        Returns:
            A dict mapping each message's position to 1 (spam) or 0 (ham).
        """
        result = dict()
        for (i, mail) in enumerate(testData):
            processed_mail = process_mails(mail)
            result[i] = int(self.classify(processed_mail))
        return result


def metrics(labels, predictions):
    """Print precision, recall, F-score and accuracy for binary predictions.

    Args:
        labels: Ground-truth values (1 for the positive class, 0 for the
            negative class), indexable by the integers ``0 .. len(labels)-1``.
        predictions: Predicted values, indexable by the same integers (a list
            or a dict keyed by position both work).

    Returns:
        None. The four scores are written to standard output. A score whose
        denominator is zero (for example precision when nothing was predicted
        positive) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than crashing the report.
        return numerator / denominator if denominator else 0.0

    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    precision = ratio(true_pos, true_pos + false_pos)
    recall = ratio(true_pos, true_pos + false_neg)
    Fscore = ratio(2 * precision * recall, precision + recall)
    accuracy = ratio(true_pos + true_neg, true_pos + true_neg + false_pos + false_neg)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", Fscore)
    print("Accuracy: ", accuracy)


# Load the labelled messages; only the first two CSV columns carry data.
mails = pd.read_csv('spam.csv', encoding='latin-1')
mails = mails.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
mails = mails.rename(columns={'v1': 'labels', 'v2': 'message'})
# Encode the textual class as an integer: ham -> 0, spam -> 1.
mails['label'] = mails['labels'].map({'ham': 0, 'spam': 1})
mails = mails.drop(columns=['labels'])
totalMails = 5587

# Random split: each row goes to the training set with probability 0.75.
trainIndex, testIndex = [], []
for i in range(mails.shape[0]):
    bucket = trainIndex if np.random.uniform(0, 1) < 0.75 else testIndex
    bucket.append(i)

# Renumber both subsets from 0 so rows can be addressed by position.
trainData = mails.loc[trainIndex].reset_index(drop=True)
testData = mails.loc[testIndex].reset_index(drop=True)
109 changes: 109 additions & 0 deletions MajorProject/static/dist/css/home.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/* Global default: light font weight on every element. */
*{
font-weight: 300;
}
/* Navigation bar: one background and text colour for the bar and its links in every state. */
nav, nav a, nav a:hover, nav a:active{
background-color: #eee2dc;
color: black;
}
/* Hovered nav links are emphasised by weight only. */
nav a:hover{
font-weight: 400;
}
/* Paragraphs inside the nav take no margin and ignore pointer events. */
nav p{
margin: 0;
pointer-events: none;
}
/* Centre the nav content at 90% of the available width. */
.nav-content{
width: 90%;
margin: auto;
}

.user-name{
margin: 1%;
}
/* Headings use an extra-light weight. */
h5,h6,h4{
font-weight: 200;
}
/* Logout button: outlined by default, filled on hover. */
.logout-button{
background-color: white;
color: #ac3b61;
border: 0.5px solid #ac3b61;
padding: 0.5% 1%;
}
.logout-button:hover{
background-color: #ac3b61;
color: white;
}
/* Card container: flat, square-cornered panel. */
.home-card{
background-color: #eee2dc;
border: 0;
border-radius: 0;
margin: 2%;
color: #123c69;
}
.card-header{
background-color: #edc7b7;
color: #ac3b61;
}
.form{
padding: 2%;
}
/* Form button: filled by default, outlined on hover (the inverse of .logout-button). */
.form-button{
margin: 2%;
margin-bottom: 0;
padding:1% 2%;
background-color: #ac3b61;
border: 0;
color: white;
}
.form-button:hover{
background-color: white;
color: #ac3b61;
border: 0.5px solid #ac3b61;
}
/* Text areas cannot be resized by the user. */
textarea{
resize: none;
}
/* Form controls: borderless, square, thin text. */
.form-control{
border: 0;
border-radius: 0;
font-weight: 100;
}
.row{
margin-top: 2%;
}
/* Inbox button: borderless white button with accent-coloured text.
   A single border declaration is kept; an earlier "1px solid" value in this
   rule was always overridden by "border: 0" and never took effect. */
.inbox-button{
padding: 2%;
margin: 2%;
background-color: white;
color: #ac3b61;
border: 0;
}
/* Hovered inbox button swaps to the filled accent colour. */
.inbox-button:hover {
background-color: #ac3b61;
color: white;
}
/* Result text: extra-light, slightly enlarged, no bottom margin. */
.result_display{
font-weight: 200;
margin: 2% 2% 0 2%;
font-size: 1.25em;
}
/* Same look as .home-card but with wider margins. */
.logout-card{
background-color: #eee2dc;
border: 0;
border-radius: 0;
margin: 10% 20%;
color: #123c69;
}
.check-form{
margin: 4%;
}
.sent-mails{
margin: 4%;
}
/* Dark links pick up the accent colour on hover. */
.link-dark:hover{
color: #ac3b61;
}
.showPassword{
text-align: left ;
}
49 changes: 49 additions & 0 deletions MajorProject/templates/base.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<!doctype html>
<html lang="en">

<head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <!-- Bootstrap CSS -->
    {# The package version must match the integrity hash below, or the browser rejects the file. #}
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-eOJMYsd53ii+scO/bJGFsiCZc+5NDVN2yr8+0RDqr0Ql0h+rP48ckxlpbzKgwra6" crossorigin="anonymous">
    <link href="../static/dist/css/home.css" rel="stylesheet">

    <title>MailBox</title>
</head>

<body class="home">
    {# Shared navigation bar for logged-in pages; child templates fill the blocks below it. #}
    <nav class="navbar navbar-expand-lg">
        <div class="container-fluid nav-content">
            <p class="navbar-brand">MailBox</p>
            <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNavAltMarkup"
                aria-controls="navbarNavAltMarkup" aria-expanded="false" aria-label="Toggle navigation">
                <span class="navbar-toggler-icon"></span>
            </button>
            <div class="collapse navbar-collapse" id="navbarNavAltMarkup">
                <div class="navbar-nav">
                    {# NOTE(review): aria-current="page" is set statically on two links; confirm whether it should follow the active page. #}
                    <a class="nav-link" aria-current="page" href="/home">New Mail</a>
                    <a class="nav-link" aria-current="page" href="/inbox">Inbox</a>
                    <a class="nav-link" href="/check_login">Check Mail</a>
                    <a class="nav-link" href="/sentMails">Sent Mails</a>
                </div>
            </div>
            <h6 class="user-name">{{user}}</h6>
            <button onclick="location.href='/logout'" class="logout-button">Logout</button>
        </div>
    </nav>
    {# Extension points, rendered in this order. #}
    {% block home %}
    {% endblock %}
    {% block inbox %}
    {% endblock inbox %}
    {% block content %}
    {% endblock %}
    {% block sentMails %}
    {% endblock %}
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/js/bootstrap.bundle.min.js"
        integrity="sha384-JEW9xMcG8R+pH31jmWH6WWP0WintQrMb4s7ZOdauHnUtxwoG2vI5DkLtS3qm9Ekf"
        crossorigin="anonymous"></script>
</body>
</html>
52 changes: 52 additions & 0 deletions MajorProject/templates/check.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<!doctype html>
<html lang="en">

<head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <!-- Bootstrap CSS -->
    {# The package version must match the integrity hash below, or the browser rejects the file. #}
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-eOJMYsd53ii+scO/bJGFsiCZc+5NDVN2yr8+0RDqr0Ql0h+rP48ckxlpbzKgwra6" crossorigin="anonymous">
    <link href="../static/dist/css/home.css" rel="stylesheet">

    <title>MailBox</title>
</head>
{# Standalone page: it carries its own navigation bar with a Login button and does not extend another template. #}
{% block content %}
<body>
    <nav class="navbar navbar-expand-lg">
        <div class="container-fluid nav-content">
            <p class="navbar-brand">MailBox</p>
            <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNavAltMarkup"
                aria-controls="navbarNavAltMarkup" aria-expanded="false" aria-label="Toggle navigation">
                <span class="navbar-toggler-icon"></span>
            </button>
            <div class="collapse navbar-collapse" id="navbarNavAltMarkup">
                <div class="navbar-nav">
                    <a class="nav-link active" aria-current="page" href="/home">New Mail</a>
                    <a class="nav-link" href="/check">Check Mail</a>
                    <a class="nav-link" href="/sentMails">Sent Mails</a>
                </div>
            </div>
            <button onclick="location.href='/login'" class="logout-button">Login</button>
        </div>
    </nav>
    <div class="container">
        <div class="card home-card">
            <div class="card-header text-center"><h5>Check your message</h5></div>
            {# The message is posted to /result; msg pre-fills the text area. #}
            <form method="POST" action="/result" class="form check-form">
                <textarea name="message" rows="10" placeholder="Enter your message.." class="form-control">{{msg}}</textarea>
                <div class="text-center">
                    <button class="form-button" type="submit">Check</button>
                </div>
                {# Extension point inside the form, below the button. #}
                {% block body %}
                {% endblock %}
            </form>

        </div>
    </div>
</body>

{% endblock content %}
</html>
22 changes: 22 additions & 0 deletions MajorProject/templates/check_login.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{% extends 'base.html' %}
{# No doctype here: base.html already emits one, and anything placed before the extends tag would be output ahead of it. #}

{% block content %}
<div class="container">
    <div class="card home-card">
        <div class="card-header text-center"><h5>Check your message</h5></div>
        {# The message is posted to /result; msg pre-fills the text area. #}
        <form method="POST" action="/result" class="form check-form">
            <textarea name="message" rows="10" placeholder="Enter your message.."
                class="form-control">{{msg}}</textarea>
            <div class="text-center">
                <button type="submit" class="form-button">Check</button>
            </div>
            {# Extension point inside the form, below the button. #}
            {% block body %}
            {% endblock %}
        </form>


    </div>
</div>

{% endblock content %}
Loading

0 comments on commit 8842edb

Please sign in to comment.