-
Notifications
You must be signed in to change notification settings - Fork 137
/
Copy pathsentiment_analysis.py
126 lines (105 loc) · 5.35 KB
/
sentiment_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata')
RT_POLARITY_POS_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-neg.txt')
#this function takes a feature selection mechanism and returns its performance in a variety of metrics
def evaluate_features(feature_select):
posFeatures = []
negFeatures = []
#http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
#breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
for i in posSentences:
posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
posWords = [feature_select(posWords), 'pos']
posFeatures.append(posWords)
with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
for i in negSentences:
negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
negWords = [feature_select(negWords), 'neg']
negFeatures.append(negWords)
#selects 3/4 of the features to be used for training and 1/4 to be used for testing
posCutoff = int(math.floor(len(posFeatures)*3/4))
negCutoff = int(math.floor(len(negFeatures)*3/4))
trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
#trains a Naive Bayes Classifier
classifier = NaiveBayesClassifier.train(trainFeatures)
#initiates referenceSets and testSets
referenceSets = collections.defaultdict(set)
testSets = collections.defaultdict(set)
#puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
for i, (features, label) in enumerate(testFeatures):
referenceSets[label].add(i)
predicted = classifier.classify(features)
testSets[predicted].add(i)
#prints metrics to show how well the feature selection did
print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
print 'pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
print 'neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
classifier.show_most_informative_features(10)
#creates a feature selection mechanism that uses all words
def make_full_dict(words):
return dict([(word, True) for word in words])
#tries using all words as the feature selection mechanism
print 'using all words as features'
evaluate_features(make_full_dict)
#scores words based on chi-squared test to show information gain (http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/)
def create_word_scores():
#creates lists of all positive and negative words
posWords = []
negWords = []
with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
for i in posSentences:
posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
posWords.append(posWord)
with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
for i in negSentences:
negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
negWords.append(negWord)
posWords = list(itertools.chain(*posWords))
negWords = list(itertools.chain(*negWords))
#build frequency distibution of all words and then frequency distributions of words within positive and negative labels
word_fd = FreqDist()
cond_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
cond_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
cond_word_fd['neg'][word.lower()] += 1
#finds the number of positive and negative words, as well as the total number of words
pos_word_count = cond_word_fd['pos'].N()
neg_word_count = cond_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
#builds dictionary of word scores based on chi-squared test
word_scores = {}
for word, freq in word_fd.iteritems():
pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
return word_scores
#finds word scores
word_scores = create_word_scores()
#finds the best 'number' words based on word scores
def find_best_words(word_scores, number):
best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
best_words = set([w for w, s in best_vals])
return best_words
#creates feature selection mechanism that only uses best words
def best_word_features(words):
return dict([(word, True) for word in words if word in best_words])
#numbers of features to select
numbers_to_test = [10, 100, 1000, 10000, 15000]
#tries the best_word_features mechanism with each of the numbers_to_test of features
for num in numbers_to_test:
print 'evaluating best %d word features' % (num)
best_words = find_best_words(word_scores, num)
evaluate_features(best_word_features)