svm.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 19 00:41:33 2018
@author: carlocarandang
"""
import random
from re import split

import nltk
from nltk.corpus import stopwords
documents = []
# Dataset: sentences labelled with positive or negative sentiment, extracted
# from reviews of products, movies, and restaurants.
read_file = "/Users/carlocarandang/Desktop/Data Mining/NLP-Sentiment Analysis/sentiment labelled sentences/amazon_cells_labelled.txt"
# Format of dataset: sentence \t score \n
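# Two made-up lines in that format (illustrative only, not taken from the file):
#   Works great, battery lasts all day.\t1
#   Stopped charging after a week.\t0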
with open(read_file, "r") as r:
    for line in r:
        # Build the list of labelled documents: text plus sentiment category.
        splitted = line.strip().split('\t')
        msg = ' '.join(splitted[:-1])
        is_class = splitted[-1]
        documents.append(dict(doc=msg.lower(), category=is_class))
# Tokenize each document into words on non-word characters.
for d in documents:
    d['words'] = split(r'\W+', d['doc'])
# Count word frequencies, leaving out stopwords and bare numbers.
stop_words = set(stopwords.words('english'))
all_words = nltk.FreqDist(w.lower() for d in documents for w in d['words']
                          if w not in stop_words and not w.isdigit())
# The 2,500 most frequent words serve as features; experiment with this size.
word_features = [w for w, _ in all_words.most_common(2500)]
# Map a document to a binary feature dict: contains(word) -> True/False.
def document_features(document):
    # Tokenize on demand, e.g. for fresh documents fed to a trained classifier.
    if not document.get('words'):
        document['words'] = split(r'\W+', document['doc'])
    document_words = set(document['words'])  # unique tokens
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
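# For illustration: if word_features were ['work', 'great'], the document
# {'doc': "does not work"} would map to
# {'contains(work)': True, 'contains(great)': False}.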
random.shuffle(documents)
#get featuresets for each document
featuresets = [(document_features(d), d['category']) for d in documents]
# K-fold cross validation (n=10) of training and testing sets.
import nltk.classify
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
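# kf.split() yields one (train_indices, test_indices) pair per fold; the
# indices are positions into featuresets.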
total = 0  # renamed from `sum` to avoid shadowing the builtin
for train_idx, test_idx in kf.split(featuresets):
    train_data = [featuresets[i] for i in train_idx]
    test_data = [featuresets[i] for i in test_idx]
    classifier = nltk.classify.SklearnClassifier(LinearSVC())
    classifier.train(train_data)
    total += nltk.classify.accuracy(classifier, test_data)
average = total / kf.get_n_splits()
print "-------------"
print "The accuracy of the SVM Classifier with 10-Fold Validations is: ", average
print "-------------------"
print "Testing the review, 'does not work': ", classifier.classify(document_features({'doc':"does not work"}))
print "-------------------"
print "Testing the review, 'works well': ", classifier.classify(document_features({'doc':"works well"}))