forked from ProjectSidewalk/SidewalkWebpage
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanonymize.py
90 lines (67 loc) · 2.99 KB
/
anonymize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import fileinput
import sys
def anonymize(sql_filename):
    """Write an anonymized copy of the Sidewalk project SQL dump.

    The dump is streamed line by line into a second file. Rows inside the
    ``COPY sidewalk_user`` data section get their username and email replaced
    by placeholders built from the index that ``get_user_index`` assigns to
    the original email; rows whose username is ``anonymous`` are left as they
    are. Rows inside the ``COPY login_info`` data section get their third
    column (provider_key) replaced by the same kind of placeholder email.
    Every other line, including the ``COPY`` header lines and the end-of-data
    marker lines, is copied through unchanged so the result stays loadable.

    The output path is the input path with a trailing ``.sql`` replaced by
    ``.anonymized.sql``; when the input does not end in ``.sql`` the suffix is
    appended instead, so the input file is never opened for writing.

    References:
    http://stackoverflow.com/questions/17140886/how-to-search-and-replace-text-in-a-file-using-python

    :param sql_filename: path of the SQL dump to read.
    :return: None
    """
    # A line consisting of exactly backslash-period ends a COPY data section.
    terminate_line = "\\."
    copy_user_start_line = "COPY sidewalk_user (user_id, username, email) FROM stdin;"
    copy_login_info_start_line = "COPY login_info (login_info_id, provider_id, provider_key) FROM stdin;"

    # Only rewrite the extension. A blanket str.replace would touch every
    # ".sql" in the path and, when there is none, would return the input path
    # itself -- opening that with "w" would wipe the dump before reading it.
    if sql_filename.endswith(".sql"):
        output_filename = sql_filename[:-len(".sql")] + ".anonymized.sql"
    else:
        output_filename = sql_filename + ".anonymized.sql"

    # Which COPY data section we are inside: None, "user" or "login_info".
    current_table = None

    with open(sql_filename, "r") as input_file, open(output_filename, "w") as output_file:
        for line in input_file:
            if copy_user_start_line in line:
                current_table = "user"
            elif copy_login_info_start_line in line:
                current_table = "login_info"
            elif line.rstrip("\r\n") == terminate_line:
                # Compare the whole line so that a row merely containing the
                # two characters is not mistaken for the end-of-data marker.
                current_table = None
            elif current_table == "user":
                # Columns: user_id, username, email
                line_list = line.split("\t")
                if line_list[1] != "anonymous":
                    email_address = line_list[2].strip()
                    current_user_index = get_user_index(email_address)
                    line_list[1] = "anonymized_user_name." + str(current_user_index)
                    line_list[2] = "anonymized." + str(current_user_index) + "@email.com\n"
                    line = "\t".join(line_list)
            elif current_table == "login_info":
                # Columns: login_info_id, provider_id, provider_key
                line_list = line.split("\t")
                email_address = line_list[2].strip()
                # NOTE(review): this literal looks like an email-obfuscation
                # placeholder rather than a real address -- confirm the
                # intended value against the upstream file.
                if "[email protected]" not in line_list[2]:
                    current_user_index = get_user_index(email_address)
                    line_list[2] = "anonymized." + str(current_user_index) + "@email.com\n"
                    line = "\t".join(line_list)

            # Header, terminator and untouched lines are written as read;
            # rewritten rows were reassigned to ``line`` above.
            output_file.write(line)
    return
def get_user_index(email_address):
    """Return the index assigned to an email address, assigning one if new.

    The mapping and the next free index are kept as attributes on this
    function, so they persist across calls: the first address seen gets 1,
    the next unseen address gets 2, and a repeated address gets back the
    index it received the first time.

    :param email_address: the address to look up; used as a dictionary key.
    :return: the integer index associated with ``email_address``.
    """
    assigned = get_user_index.email_to_index
    try:
        return assigned[email_address]
    except KeyError:
        pass

    # First time this address is seen: hand out the next free index.
    new_index = get_user_index.user_index
    assigned[email_address] = new_index
    get_user_index.user_index = new_index + 1
    return new_index


# State shared by all calls: address -> index map and the next index to assign.
get_user_index.email_to_index = {}
get_user_index.user_index = 1
if __name__ == '__main__':
    # Usage: python anonymize.py <filename of the sql file>
    # When no filename is given on the command line, fall back to a
    # hard-coded default dump path.
    sql_filename = sys.argv[1] if len(sys.argv) > 1 else "resources/sidewalk_20160629.sql"
    anonymize(sql_filename)