-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathusers.py
153 lines (123 loc) · 3.88 KB
/
users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import re
import requests
import collections
def get_users(content):
    '''
    Collect the distinct sender names that appear in the chat lines.

    A sender is the text captured between a ``hh:mm - `` prefix and the
    following ``: `` on a line. Lines that contain zero or more than one
    such header are ignored.

    content: iterable of chat lines (str).
    Returns a dict mapping each sender name to an empty dict, in order of
    first appearance.
    '''
    sender_pattern = re.compile(r'\d\d:\d\d\s-\s(.*?):\s')
    users = {}
    for line in content:
        found = sender_pattern.findall(line.rstrip())
        # Only accept lines with exactly one header-like match.
        if len(found) != 1:
            continue
        name = found[0]
        # An empty capture (e.g. "10:00 - : text") is not a usable name.
        if name and name not in users:
            users[name] = {}
    return users
def get_gender(users):
    '''
    Ask the genderize.io API for the gender of every user name.

    Names are sent in batches of ten per request. The returned list holds
    exactly one entry per user, in the iteration order of ``users``, so it
    can be indexed in lockstep with the users.

    users: iterable of user names (iterating a dict yields its keys).
    Returns a list of gender strings; 'Not identified' is used when the API
    reports no gender or gives no usable result for a name.
    Raises whatever ``requests`` raises on network failure or timeout.
    '''
    batch_size = 10
    names = list(users)
    genders = []
    for start in range(0, len(names), batch_size):
        batch = names[start:start + batch_size]
        # Let requests build the query string so names containing spaces,
        # '&' or non-ASCII characters are encoded correctly.
        # NOTE(review): the repeated `name` key is kept from the original
        # code; confirm against the genderize docs whether `name[]` is the
        # expected key for multi-name queries.
        response = requests.get(
            'https://api.genderize.io/',
            params=[('name', name) for name in batch],
            timeout=10,
        )
        payload = response.json()
        # A lone JSON object (instead of a list) is treated as one result.
        results = [payload] if isinstance(payload, dict) else payload
        for position in range(len(batch)):
            result = results[position] if position < len(results) else None
            gender = result.get('gender') if isinstance(result, dict) else None
            genders.append(gender if gender is not None else 'Not identified')
    return genders
def get_messages(content, user):
    '''
    Return the text of every message sent by ``user``.

    A line belongs to ``user`` when the name appears at the start of the
    line or right after a `` - `` separator and is followed by ``: ``.
    The message is everything after that ``: `` (trailing whitespace
    stripped).

    content: iterable of chat lines (str).
    user: sender name, taken literally (regex metacharacters are escaped).
    Returns a list of message strings in the order they appear.
    '''
    # Escape the name so characters such as '+', '(' or '.' are literal,
    # and anchor it so "Pablo" does not match inside "Juan Pablo".
    pattern = re.compile(r'(?:^|\s-\s)' + re.escape(user) + r': (.*)')
    messages = []
    for line in content:
        match = pattern.search(line.rstrip())
        if match:
            messages.append(match.group(1))
    return messages
def get_nfiles(messages):
    '''
    Count the messages whose whole text is the '<Archivo omitido>'
    placeholder (presumably what the chat export writes in place of an
    attached file).
    '''
    return sum(1 for text in messages if text == '<Archivo omitido>')
def get_ratio(n_messages, elem):
    '''
    Return the number of elem (files or links) sent every 100 messages.

    n_messages: total number of messages.
    elem: number of files or links among those messages.
    Returns 0 when there are no messages, instead of dividing by zero.
    '''
    if n_messages == 0:
        return 0
    return elem * 100 / n_messages
def get_n_emojis(messages):
    '''
    Count the characters in the code-point range U+1F600..U+1F650 across
    all messages.
    '''
    emoji_pattern = re.compile(r'[\U0001f600-\U0001f650]')
    all_text = ''.join(messages)
    return sum(1 for _ in emoji_pattern.finditer(all_text))
def get_longest_message(messages):
    '''
    Return the longest message; on a tie the earliest one wins.

    Returns an empty string when there are no messages, instead of
    raising ValueError.
    '''
    return max(messages, key=len, default='')
def get_words(messages):
    '''
    Return every word of every message as one flat list.

    Messages are split on runs of whitespace, so repeated spaces or an
    empty message do not produce empty-string "words".
    '''
    words = []
    for message in messages:
        words.extend(message.split())
    return words
def get_most_common_words(words):
    '''
    Return a list of (word, count) pairs for all words, ordered from the
    most repeated to the least repeated.
    '''
    frequencies = collections.Counter()
    frequencies.update(words)
    return frequencies.most_common()
def get_nlinks(messages):
    '''
    Return the total number of http/https links found in the messages.

    A link is ``http://`` or ``https://`` followed by at least one
    character that is not whitespace, ',' or ';'. Links containing
    characters such as '_', '%' or '#' are counted.
    '''
    # Compiled once, as a raw string so the backslashes reach the regex
    # engine unchanged.
    link_pattern = re.compile(r'https?://[^\s,;]+')
    return sum(len(link_pattern.findall(message)) for message in messages)
def get_mpd(n, days):
    '''
    Messages per day (mpd): the message count n divided by the number
    of days.
    '''
    messages_per_day = n / days
    return messages_per_day
def main(content, metadata):
    '''
    Build the per-user statistics for a chat.

    content: iterable of chat lines; it is scanned once to find the users
        and once more per user to collect that user's messages.
    metadata: mapping; only metadata['days'] is read, as the divisor for
        the messages-per-day figure.
    Returns a dict mapping each user name to a dict of statistics
    (gender, messages, counts, ratios, longest message, word ranking).
    '''
    users = get_users(content)
    genders = get_gender(users)
    for i, user in enumerate(users):
        messages = get_messages(content, user)
        n_messages = len(messages)
        if n_messages == 0:
            # Give every statistic a neutral value so that nothing is left
            # unbound, or carried over from the previous user.
            longest_message = ''
            len_longest_message = n_emojis = 0
            most_common_words = ''
            n_files = ratio_files_and_messages = 0
            n_links = ratio_links_and_messages = mpd = 0
        else:
            longest_message = get_longest_message(messages)
            len_longest_message = len(longest_message)
            n_emojis = get_n_emojis(messages)
            words = get_words(messages)
            most_common_words = get_most_common_words(words)
            n_files = get_nfiles(messages)
            ratio_files_and_messages = get_ratio(n_messages, n_files)
            n_links = get_nlinks(messages)
            ratio_links_and_messages = get_ratio(n_messages, n_links)
            mpd = get_mpd(n_messages, metadata['days'])
        # Only the values are replaced (the key set is unchanged), which is
        # safe while iterating over the dict.
        users[user] = {
            'gender': genders[i],
            'messages': messages,
            'n_messages': n_messages,
            'longest_message': longest_message,
            'len_longest_message': len_longest_message,
            'n_emojis': n_emojis,
            'most_common_word': most_common_words,
            'n_files': n_files,
            'ratio_files_and_messages': ratio_files_and_messages,
            'n_links': n_links,
            'ratio_links_and_messages': ratio_links_and_messages,
            'mpd': mpd,
        }
    return users