# -*- coding: utf-8 -*-
"""
Created on Thu Aug 31 12:56:47 2017
@author: Vibhuti
"""
# Counting terms
import re

emoticons_str = r"""
    (?:
        [:=;]  # Eyes
        [oO\-]?  # Nose (optional)
        [D\)\]\(\]/\\OpP]  # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
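
# A quick, illustrative check of the tokenizer; the sample string below is
# made up for demonstration and is not taken from the streamed data.
sample_tweet = "RT @example: Houston #flood update :) http://example.com/123"
print(preprocess(sample_tweet, lowercase=True))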
import json

with open('stream_HoustonFloods.json', 'r') as f:
    for line in f:
        # print(line)
        tweet = json.loads(line)
        tokens = preprocess(tweet['text'])
        print(tokens)
# Remove stopwords
from nltk.corpus import stopwords
import string

punctuation = list(string.punctuation)  # creates a list of punctuation marks
stop = stopwords.words('english') + punctuation + ['rt', 'RT', 'via', '…']
# Create a list of stop-filtered terms for the last tweet read above
terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]
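
# Note: the NLTK stopword corpus must be downloaded once before
# stopwords.words('english') will work, e.g.:
#   import nltk
#   nltk.download('stopwords')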
import operator
from collections import Counter

fname = 'stream_HoustonFloods.json'
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list of terms with stopwords removed
        terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]
        # Update the counter
        count_all.update(terms_stop)
    # Print the 5 most frequent terms
    print(count_all.most_common(5))
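
# The same file-scan-and-count pattern repeats in the sections below with
# different term filters. As a rough sketch (the helper name and arguments
# are my own, not part of the original script), it could be factored out as:
def count_terms(filename, keep):
    """Count the terms kept by the `keep` predicate across a file of tweets,
    one JSON object per line."""
    counts = Counter()
    with open(filename, 'r') as fh:
        for ln in fh:
            tw = json.loads(ln)
            counts.update(term for term in preprocess(tw['text']) if keep(term))
    return counts

# e.g. the stop-filtered count above is equivalent to:
# print(count_terms(fname, lambda t: t not in stop).most_common(5))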
####### Get the specific terms #############################
# Create the list of all terms
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        terms_all = [term for term in preprocess(tweet['text'])]
        # Update the counter
        count_all.update(terms_all)
    # Print the 5 most frequent terms
    print(count_all.most_common(5))

# Keep each term only once (for the last tweet read); counting such sets
# per tweet is equivalent to the document frequency
terms_single = set(terms_all)
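
# A minimal sketch of the document-frequency idea noted above: update a
# counter with the *set* of terms from each tweet, so every term is counted
# at most once per tweet (variable names here are illustrative).
doc_freq = Counter()
with open(fname, 'r') as f:
    for line in f:
        tweet = json.loads(line)
        doc_freq.update(set(preprocess(tweet['text'])))
print(doc_freq.most_common(5))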
# Count hashtags only
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Keep only the hashtag terms
        terms_hash = [term for term in preprocess(tweet['text'])
                      if term.startswith('#')]
        # Update the counter
        count_all.update(terms_hash)
    # Print the 5 most frequent hashtags
    print(count_all.most_common(5))
# Count terms only (no stopwords, no hashtags, no mentions)
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Keep terms that are not stopwords and do not start with '#' or '@'
        terms_only = [term for term in preprocess(tweet['text'])
                      if term not in stop and
                      not term.startswith(('#', '@'))]
        # Update the counter
        count_all.update(terms_only)
    # Print the 5 most frequent terms
    print(count_all.most_common(5))
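
# The same filter pattern could be pointed at @-mentions instead of hashtags;
# this is an illustrative sketch, not part of the original analysis.
with open(fname, 'r') as f:
    count_mentions = Counter()
    for line in f:
        tweet = json.loads(line)
        count_mentions.update(term for term in preprocess(tweet['text'])
                              if term.startswith('@'))
    print(count_mentions.most_common(5))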
# Sequences of two terms (bigrams)
from nltk import bigrams

with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list of terms with stopwords removed
        terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]
        terms_bigram = bigrams(terms_stop)
        # Update the counter
        count_all.update(terms_bigram)
    # Print the 5 most frequent bigrams
    print(count_all.most_common(5))
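
# For reference, nltk.bigrams yields consecutive pairs; the input below is an
# illustrative example, not taken from the data:
print(list(bigrams(['houston', 'flood', 'rescue'])))
# -> [('houston', 'flood'), ('flood', 'rescue')]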