-
Notifications
You must be signed in to change notification settings - Fork 1
/
sec3_proc.py
215 lines (182 loc) · 8.03 KB
/
sec3_proc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""Scripts to run the preprocessing and information parts of the paper."""
import json
# from langdetect import detect # --- see line 165
from sec3_data import DB
from sec3_data import reconstruct_ids
from sec3_data import log
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
import spacy
class AnnotationStats(object):
    """Factory to post annotation metrics from the original paper.

    Parameters
    ----------
    paper : bool
        If true, limits the amount of annotations to those that were available
        during the writing of the paper (1,456 annotations). If false, uses all
        of the information.
    labels : bool
        If true, output general stats regarding the annotations (amount of
        instances, males, females, other, unsure, and missing).
    bots : bool
        If true, output the percentage of bots in the sample.
    agreement : bool
        Calculate the agreement accuracy between the manually provided labels,
        and those given by the distant query method.
    interrater : bool
        If true, include the Cohen's kappa agreement score between the first
        and the second annotator if paper is true, otherwise for all three.

    Attributes
    ----------
    annotations : dict
        Loaded JSON file for the annotated twitter profiles from our paper.
    paper : bool
        See parameter description.
    stats : dict
        Dictionary representation for the information required to output the
        statistics.
    args : dict
        Class arguments (kwargs).
    """

    def __init__(self, **kwargs):
        """Open query database with annotations and pre-compute stats."""
        # Use a context manager so the file handle is not leaked.
        with open('./corpora/query-gender.json') as jf:
            self.annotations = json.load(jf)['annotations']
        self.paper = kwargs.get('paper', True)
        if self.paper:
            print("PLEASE NOTE: this information is for the annotation part " +
                  "only.\nThe models were trained on _all_ data.\n\n"
                  "Furthermore, v1 of the paper included kappa scores for " +
                  "'not sure'\nannotations, which should have been excluded " +
                  "(kappa = 0.90 then).\n")
            # IDs >= 210040000 were annotated after the paper was written;
            # drop them to reproduce the paper's subset.
            self.annotations = {k: v for k, v in self.annotations.items()
                                if int(k) < 210040000}
        else:
            print("PLEASE NOTE: any statistics dealing with annotations in " +
                  "the paper\ndeal with a subset, reproducable by setting " +
                  "paper=True.\n\n")
        self.stats = {'bots': 0, 'total': 0, 'm': 0, 'f': 0, 'o': 0,
                      '-': 0, '0': 0, 'distant': [], 'hand': [],
                      'ann': []}
        self.calculate_stats()
        self.args = kwargs

    def calculate_stats(self):
        """Calculate all relevant stats from the loaded annotations.

        Fills self.stats with label counts, bot count, the distant/hand
        label pairs used for the agreement score, and the per-annotator
        label lists ('raters') used for the kappa scores.
        """
        raters = {'ann1': [], 'ann2': [], 'ann3': []}
        for line in self.annotations.values():
            # NOTE(review): 'bot' is stored as the string 'True'/'False' in
            # the JSON, hence the string comparison.
            if line['bot'] == 'True':
                self.stats['bots'] += 1
            if line['majority'] in self.stats:
                self.stats[line['majority']] += 1
            # Agreement is only computed on m/f profiles for which the
            # distant (query) method produced a label.
            if line['majority'] in ['m', 'f'] and line['query_label2'] != '0':
                self.stats['distant'].append(line['query_label2'])
                self.stats['hand'].append(line['majority'])
                for i in range(1, 4):
                    raters['ann' + str(i)].append(line['ann' + str(i)])
            self.stats['total'] += 1
        self.stats['raters'] = raters

    def _kappa(self, raters, y1, y2):
        """Return (n, Cohen's kappa) for annotator pair y1/y2.

        Pairs where either annotator gave an ignored label ('0'/'o', plus
        '-' outside paper mode) are excluded from the computation.
        """
        ignore = ('0', 'o') if self.paper else ('0', 'o', '-')
        simsc = [(x, y) for x, y in zip(raters[y1],
                                        raters[y2])
                 if x not in ignore and y not in ignore]
        return len(simsc), cohen_kappa_score(*zip(*simsc))

    def report(self):
        """Report stats according to args provided in class init."""
        s = self.stats
        a = self.args
        print("Amount of instances \t", len(self.annotations))
        if a.get('labels'):
            print("Amount of males \t", s['m'])
            print("Amount of females \t", s['f'])
            print("Amount of other \t", s['o'])
            print("Amount of unsure \t", s['-'])
            print("Amount of missing \t", s['0'])
        if a.get('bots'):
            print("Percentage of bots \t",
                  round(s['bots'] / s['total'] * 100, 1), '%')
        if a.get('agreement'):
            acc = accuracy_score(s['hand'], s['distant']) * 100
            print("Agreement score \t", round(acc, 1))
        if a.get('interrater'):
            kapl12, kap12 = self._kappa(s['raters'], 'ann1', 'ann2')
            print("Kappa 1 : 2 @ ", kapl12, "\t", round(kap12, 2))
            # The third annotator only labelled the post-paper data, so the
            # 2:3 and 1:3 scores are only meaningful on the full set.
            if not self.paper:
                kapl23, kap23 = self._kappa(s['raters'], 'ann2', 'ann3')
                kapl13, kap13 = self._kappa(s['raters'], 'ann1', 'ann3')
                print("Kappa 2 : 3 @ ", kapl23, "\t", round(kap23, 2))
                print("Kappa 1 : 3 @ ", kapl13, "\t", round(kap13, 2))
def data_to_batches(db_id, label_mapping):
    """Convert messages in db to fasttext format and filter non-english.

    Files are written to ./data/{db_id}.dataf. Tweets are grouped per user
    into batches of 200, each written as one `__label__N text` line.

    Parameters
    ----------
    db_id : str
        The string identifier of the database to be converted to fasttext
        format.
    label_mapping : dict
        A label -> int mapping to convert the labels to a number.

    Returns
    -------
    None
    """
    # Query/twitter databases have a corrected ('_fix') message table.
    aff = '_fix' if 'query' in db_id or 'twitter' in db_id else ''
    db = DB(db_id + '_msg' + aff, 'r')
    user_ids, _ = reconstruct_ids(db_id)
    batch, bin_lab = [], label_mapping
    label, cur_user = str(), str()
    log("Processing " + db_id + "...")
    # NOTE(review): the original loaded spacy and tokenized each batch, but
    # then immediately overwrote the tokenized text with the raw tweets, so
    # the tokenization was dead work -- removed here; output is unchanged.
    with open('./data/' + db_id + '.dataf', 'w') as ft:
        for line in db.loop():
            # Every batch after the first is preceded by a newline.
            prep = "" if not cur_user else "\n"
            if cur_user != line['user_id']:
                # New user: drop any partial batch of the previous user.
                batch = []
                try:
                    label = str(bin_lab[user_ids[line['user_id']]])
                except KeyError:
                    # User has no known label; skip their tweets.
                    continue
                cur_user = line['user_id']
            if label:
                text = line['tweet_text'].replace('\n', ' ').replace('\t', ' ')
                batch.append(text)
            if len(batch) == 200:
                tokens = '\t'.join(batch)
                # NOTE: language detection was found to hurt performance
                # if detect(tokens) == 'en':
                if tokens[0] == ' ':
                    tokens = tokens[1:]
                tokens = tokens.replace('\r', ' ')
                log("Wrote tweet batch...")
                ft.write(prep + '__label__{0} '.format(label) + tokens)
                batch = []
def batches_to_sets(db_id, test_size=0.2):
    """Split fasttext batches into train and test.

    Reads ./data/{db_id}.dataf and writes ./data/{db_id}.train and
    ./data/{db_id}.test; the last round(len(batches) * test_size) batches
    form the test set.

    Parameters
    ----------
    db_id : str
        String identifier of the batch file to split.
    test_size : float, optional
        Fraction of batches reserved for the test set (default 0.2).

    Returns
    -------
    None
    """
    with open('./data/' + db_id + '.dataf', 'r') as ft:
        batches = ft.read().split('\n')
    n_test = round(len(batches) * test_size)
    n_train = len(batches) - n_test
    train = batches[:n_train]
    # Slice from n_train rather than -n_test: when n_test is 0 the original
    # batches[-0:] returned *all* batches as the test set.
    test = batches[n_train:]
    with open('./data/' + db_id + '.train', 'w') as ftrain:
        ftrain.writelines(x + '\n' for x in train)
    with open('./data/' + db_id + '.test', 'w') as ftest:
        ftest.writelines(x + '\n' for x in test)
if __name__ == "__main__":
ans = AnnotationStats(paper=True, labels=True, bots=True, agreement=True,
interrater=True)
ans.report()
lm = {'m': 0, 'f': 1, 'M': 0, 'F': 1}
# data_to_batches(db_id='twitter_gender', label_mapping=lm)
data_to_batches(db_id='query_gender', label_mapping=lm)
data_to_batches(db_id='plank_gender', label_mapping=lm)
data_to_batches(db_id='volkova_gender', label_mapping=lm)
# batches_to_sets(db_id='twitter_gender', test_size=0.2)
batches_to_sets(db_id='query_gender', test_size=0.2)
batches_to_sets(db_id='plank_gender', test_size=0.2)
batches_to_sets(db_id='volkova_gender', test_size=0.2)