# coding: utf-8
"""
This file is the starting point for the script(s).
Documentation is currently only available in Swedish at http://verifierad.nu
- which redirects to the official GitHub repository.
A change log is kept in the file CHANGELOG.md
"""
from datetime import datetime
from time import sleep  # needed by the mobileFriendlyCheck regime below
import sys
import _privatekeys as privatekeys
import test
import helper
from checks.google_pagespeed import google_pagespeed_check, check_lighthouse
# from checks.content import content_check, find_string  # uncomment this line to try the preview of content checks
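# Test regimes handled below: 'httpStatusCodeCheck', 'sitemapCheck',
# 'urlHarvest', 'googlePageSpeed', 'mobileFriendlyCheck', 'contentCheck'
# and 'findString' in oneOffProcess; 'googlePageSpeed', 'httpStatusCodeCheck',
# 'mobileFriendlyCheck', 'thirdPartiesCheck' and 'contentCheck' in
# oneOffFromSitemap.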
# local variables
# url_for_mainProcess = 'http://vgregion.se/'
i = 1 # global iteration counter
def oneOffProcess(file, test_regime='httpStatusCodeCheck'):
    """
    Inspects a text file, assuming it contains URLs, one URL per line.
    Attributes: file (path of the file to open)
    """
f = open(file, 'r')
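    # Expected input (hypothetical example), one URL per line:
    #   http://example.com/
    #   https://example.org/about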
    urlsInTextfile = []
    iteration_counter = 1
    keep_on = True
    time_to_sleep_in_seconds = 90  # TODO: find out why Mobile Friendly cannot manage several tests in a row - a rate limit?
    output_file = ""
    i = 1
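    # Reads the file line by line; a line shorter than seven characters ends
    # the loop (presumably since 'http://' is the shortest meaningful prefix).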
while keep_on:
url = f.readline().replace('\n', '')
mess_to_console = '{0}. {1}'.format(iteration_counter, url)
        if len(url) < 7:  # stop when the line is shorter than seven characters
            keep_on = False
elif not url.endswith('.pdf'):
# depending on which test regime is chosen
if test_regime == 'httpStatusCodeCheck':
status_code = test.httpStatusCodeCheck(url, False)
print('{0} has a status code: {1}'.format(mess_to_console,
status_code).replace('\n', ''))
output_file += '{0}, {1}\n'.format(url.replace('\n', ''), status_code)
            elif test_regime == 'sitemapCheck':
                """
                Check the status code of domain.tld/sitemap.xml, assuming the URL is only the domain, not a URI
                """
                if url[-1:] == '/':
                    url = url[:-1]
                url = '{0}/{1}'.format(url, 'sitemap.xml')
status_code = test.httpStatusCodeCheck(url, False)
print('{0} has a status code: {1}'.format(mess_to_console,
status_code).replace('\n', ''))
                is_sitemap = "undefined"
                if str(status_code)[:1] == "2" or str(status_code)[:1] == "3":  # status code in the 2xx or 3xx series
                    is_sitemap = helper.is_sitemap(helper.httpRequestGetContent(url))
                print('Is sitemap: {0}'.format(is_sitemap))
                output_file += '{0}, {1}, {2}\n'.format(url.replace('\n', ''), status_code, is_sitemap)
elif test_regime == 'urlHarvest':
"""
Fetches URLs from a page's content
"""
i = 0
print('Harvesting URLs from {0}'.format(url))
try:
for found_url in helper.fetchUrlsFromPage(url, 50):
output_file += '{0}\n'.format(found_url)
                        i += 1
                except Exception:
                    print('Error! The URL {0} failed.'.format(url))
                # print('Found {0} URLs from {1}'.format(i, url))
elif test_regime == 'googlePageSpeed':
check_page = check_lighthouse(url)
if bool(check_page):
print('{0} has been checked against Google Pagespeed API'.format(
mess_to_console))
                    for key in check_page:
                        output_file += '{0},{1},{2}\n'.format(url, key, check_page[key])
elif test_regime == 'mobileFriendlyCheck':
print(url)
status_message = test.mobileFriendlyCheck(url,
privatekeys.googleMobileFriendlyApiKey)
                print("Mobile-friendliness of URL '{0}' was evaluated as: {1}".format(
                    url, status_message))
                output_file += '{0}, {1}\n'.format(url.replace('\n', ''), status_message)
                sleep(time_to_sleep_in_seconds)  # sleep n seconds between requests, see the TODO above
elif test_regime == 'contentCheck':
print("{0}. Checking content of URL '{1}'.".format(i, url))
for key, value in content_check(url).items():
output_file = output_file + '{0},{1},{2}\n'.format(url, key, value)
i = i + 1
elif test_regime == 'findString':
searching = find_string('piwik', url)
print("{0}. Checking for string in URL '{1}' - {2}".format(i, url, searching))
output_file = output_file + '{0},{1}\n'.format(url, searching)
i = i + 1
# sleep(time_to_sleep_in_seconds) # sleeping for n seconds
urlsInTextfile.append(url)
iteration_counter += 1
f.close()
### Writing the report
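    # File name ends up like e.g. rapporter/2020-03-18_httpStatusCodeCheck_<uniqueId>.csv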
file_name = 'rapporter/{0}_{1}_{2}.csv'.format(str(datetime.today())[:10],
test_regime, helper.getUniqueId())
helper.writeFile(file_name, output_file)
print('The report has now been written to a file named: {0}'.format(file_name))
def oneOffFromSitemap(url_to_sitemap, check_limit,
                      date_limit, naming, test_regime):
    """
    Checks the URLs found in a sitemap against the chosen test regime
    (originally this only ran the Google Pagespeed check).
    Attributes: url_to_sitemap (address of the sitemap.xml), check_limit
    (maximum number of URLs to check), date_limit (handed to
    helper.fetchUrlsFromSitemap, presumably to skip older URLs),
    naming (used in the report's file name), test_regime
    """
urls = helper.fetchUrlsFromSitemap(url_to_sitemap, date_limit)
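    # Each url entry is indexable and url[1] holds the address below
    # (index 0 is presumably the lastmod date from the sitemap).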
i = 1
output_file = ''
for url in urls:
mess_to_console = '{0}. {1}'.format(i, url[1])
if i > check_limit:
break
try:
if test_regime == 'googlePageSpeed':
check_page = check_lighthouse(url[1])
if bool(check_page):
print('{0} has been checked against Google Pagespeed API'.format(
mess_to_console))
                    for key in check_page:
                        output_file += '{0},{1},{2}\n'.format(url[1], key, check_page[key])
i = i + 1
elif test_regime == 'httpStatusCodeCheck':
status_code = test.httpStatusCodeCheck(url[1], False)
                print('{0} has a status code: {1}'.format(mess_to_console, status_code))
output_file += '{0}, {1}\n'.format(url[1].replace('\n', ''), status_code)
i = i + 1
elif test_regime == 'mobileFriendlyCheck':
status_message = test.mobileFriendlyCheck(url[1],
privatekeys.googleMobileFriendlyApiKey)
                print("{0}. Mobile-friendliness of URL '{1}' was evaluated as: {2}".format(
                    i, url[1], status_message))
output_file += '{0}, {1}\n'.format(url[1].replace('\n', ''),
status_message)
i = i + 1
elif test_regime == 'thirdPartiesCheck':
status_message = test.thirdPartiesCheck(url[1])
print("{0}. Third parties of URL '{1}' were evaluated as: {2}".format(i, url[1], status_message))
output_file += '{0}, {1}\n'.format(url[1].replace('\n', ''), status_message)
i = i + 1
elif test_regime == 'contentCheck':
print("{0}. Checking content of URL '{1}'.".format(i, url[1]))
for key, value in content_check(url[1]).items():
output_file = output_file + '{0},{1},{2}\n'.format(url[1], key, value)
i = i + 1
        except Exception:
            print('Error! The request for URL "{0}" failed.\nMessage:\n{1}'.format(
                url[1], sys.exc_info()))
            i = i + 1
# Writing the report
file_name = 'rapporter/{0}_{1}_{2}.csv'.format(str(datetime.today())[:10], naming,
helper.getUniqueId())
helper.writeFile(file_name, output_file)
print('Report written to disk at {0}'.format(file_name))
# Supposed to support scheduling from bash scripts or hosts such as PythonAnywhere
def checkSitemapsForNewUrls(file, check_limit, date_limit, test_regime):
"""
Checking a list of predefined sitemaps for new or updated URLs
Attributes: string file (for the file location on disk)
"""
f = open(file, 'r')
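    # Expected input (hypothetical example), one sitemap URL per line:
    #   https://www.vgregion.se/sitemap.xml
    #   https://example.org/sitemap.xml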
for line in f:
sitemap = line.replace('\n', '')
sitemap_friendly_name = sitemap.replace('http://', '').replace('https://', '').replace('/', '-')
print('\nInitiating check of sitemap: {0}'.format(sitemap))
oneOffFromSitemap(sitemap, check_limit,
date_limit, '{0}-{1}'.format(test_regime, sitemap_friendly_name), test_regime)
# iterate over the available URLs and call their sitemaps
# check whether there is material less than 14 days old (in the end this is run roughly daily per site)
# build a register of the age of URLs
# if a new URL is found, it is put in a text file that is checked at the end of the run
"""
If the file is executed directly, call one of the definitions below
"""
if __name__ == '__main__':
#oneOffProcess('exempelfiler/2020-03-18-covidtest.txt', 'googlePageSpeed')
oneOffFromSitemap('https://www.vgregion.se/sitemap.xml', 10,
'2019-02-17T06:19:00+01:00', 'googlePageSpeed', 'googlePageSpeed')
# checkSitemapsForNewUrls('exempelfiler/sitemaps.txt')
# checkSitemapsForNewUrls('exempelfiler/sitemaps.txt', check_limit=99999, date_limit='2017-08-01T06:19:00+01:00', test_regime='contentCheck')
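    # Hypothetical example (the file name is an assumption, following the 'exempelfiler/' pattern above):
    # oneOffProcess('exempelfiler/urls.txt', 'httpStatusCodeCheck')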
# for key, value in content_check('http://webbstrategiforalla.se/konferenser/').items():
# print("{key}: {value}".format(key=key, value=value))