-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDSJobsScraper.py
230 lines (167 loc) · 6.25 KB
/
DSJobsScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
o BeautifulSoup: https://www.analyticsvidhya.com/blog/2015/10/beginner-guide-web-scraping-beautiful-soup-python/
Open SQL DB
Read Jobs from Job DB
Write Jobs to DB
Use Decision tree model to analyze Jobs
2DO
o Schedule day run
o Use IP Switcher Stem: https://dm295.blogspot.co.at/2016/02/tor-ip-changing-and-web-scraping.html?m=1
o NLP Textblob: http://textblob.readthedocs.io/en/dev/quickstart.html | https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/
'''
import requests
import re
from bs4 import BeautifulSoup
from DSJobsDB import Job
import time
import datetime
import warnings
#import cgi
#import cgitb; cgitb.enable() # for troubleshooting
from stem import Signal
from stem.control import Controller
import json
keywords = []
def load_keywords():
global keywords
with open('data/search_keywords.json', 'r') as json_file:
keywords = json.load(json_file)
#https://stackoverflow.com/questions/30286293/make-requests-using-python-over-tor
def get_tor_session():
    """Return a requests session whose traffic is routed through the
    local Tor SOCKS proxy, after requesting a fresh Tor circuit.

    See https://stackoverflow.com/questions/30286293/make-requests-using-python-over-tor
    """
    renew_connection()
    tor_proxy = 'socks5://127.0.0.1:9050'
    session = requests.session()
    session.proxies = {'http': tor_proxy,
                       'https': tor_proxy}
    return session
def renew_connection():
    """Ask the local Tor control port (9051) for a new circuit.

    Authenticates with the configured control password and sends the
    NEWNYM signal so subsequent requests use a fresh exit node.
    """
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password='my password')
        controller.signal(Signal.NEWNYM)
def GetWebPage(url):
    '''
    Fetch *url* over the Tor session and return it parsed as a
    BeautifulSoup object.
    '''
    session = get_tor_session()
    page = session.get(url)
    # Name the parser explicitly: with no parser argument bs4 emits a
    # GuessedAtParserWarning and may pick a different parser on each
    # machine, which can change the parse tree.
    soup = BeautifulSoup(page.content, "html.parser")
    return soup
def GetKarriereAtJobDetail(url_):
    '''
    Follow *url_* to the detailed job page (over Tor) and return the raw
    JSON-LD job description embedded in it, or '' when none is present.
    '''
    session = get_tor_session()
    page = session.get(url_)
    # Use page.text, NOT str(page.content): calling str() on bytes
    # produces the "b'...'" repr with escape sequences baked into the
    # returned payload.
    return ExtractLdJson(page.text)


def ExtractLdJson(html):
    '''
    Return the body of the first <script type="application/ld+json">
    element in *html*.

    Replaces the original magic-offset slicing (x1_+20 / x1_+21), which
    assumed an exact '">' after the marker and returned garbage slices
    when find() came back -1. Returns '' when the marker, the tag close
    or the closing </script is missing.
    '''
    marker = html.find("application/ld+json")
    if marker == -1:
        return ''
    start = html.find('>', marker)
    if start == -1:
        return ''
    end = html.find('</script', start)
    if end == -1:
        return ''
    return html[start + 1:end]
def GetKarriereAtJob(job):
    '''
    Scrape every karriere.at result page for the search phrase *job* and
    persist all jobs found to the database.

    Uses the keyphrase to generate a URL and walks the paginated result
    list, collecting source, url, title, id, posting date, company and
    the detailed description of each job item. karriere.at keeps serving
    the last page for page numbers past the end, so the walk stops when
    the first item of a page repeats the first item already seen, when a
    page yields no items, or after a hard cap of 25 pages. The Tor exit
    node is rotated every requestLimit_ detail requests.

    Fix vs. original: the termination check now runs BEFORE any item of
    the page is appended — the original detected the repeat on the first
    item but still appended the whole duplicate page (duplicate rows in
    the DB) and fetched one extra page after deciding to exit.
    '''
    searchString_ = 'https://www.karriere.at/jobs?keywords='
    PrintToHtml('Attempt to read: ' + searchString_ + job)
    src = 'www.karriere.at'
    a = 1
    webData = GetWebPage(searchString_ + job + '&page=' + str(a))
    src_ = []
    url_ = []
    job_ = []
    id_ = []
    when_ = []
    company_ = []
    jobDetail_ = []
    jobFirstID_ = ''
    jobExit_ = 0
    requestLimit_ = 50   # detail requests between Tor IP rotations
    countRequest_ = 0
    while jobExit_ == 0:
        PrintToHtml('Read jobs: ' + str(job) + ', page: ' + str(a))
        items = webData.find_all('div', attrs={'class': 'm-jobItem__dataContainer'})
        # Stop when the page is empty or repeats the previous first job.
        if not items or str(items[0].find('a')['href'][-7:]) == str(jobFirstID_):
            jobExit_ = 1
        else:
            jobFirstID_ = items[0].find('a')['href'][-7:]
            for div in items:
                href = div.find('a')['href']
                src_.append(src)
                url_.append(href)
                job_.append(div.find('a').contents[0])
                id_.append(href[-7:])   # job id = last 7 chars of the url
                # Fetch the detail page for this job right away.
                jobDetail_.append(GetKarriereAtJobDetail(href))
                countRequest_ += 1
                if countRequest_ >= requestLimit_:
                    SwitchIp()
                    countRequest_ = 0
            # Date and company are scraped only for pages that were
            # actually kept, so these lists stay aligned with id_.
            for div in webData.find_all(class_=re.compile("m-jobItem__date")):
                when_.append(div.contents[0])
            for div in webData.find_all(class_=re.compile("m-jobItem__company")):
                company_.append(div.contents[0])
        a += 1
        if a > 25:
            jobExit_ = 1
            PrintToHtml('Search String ' + searchString_ + job + '&page=' + str(a) + ' provides no result')
        if jobExit_ == 0:
            webData = GetWebPage(searchString_ + job + '&page=' + str(a))
    # Write all collected items to the database.
    with Job() as db:
        a = 0
        while a < len(id_):
            db.writeJob(src_[a], job, id_[a], url_[a], job_[a], when_[a], company_[a])
            db.writeJobDetail(src_[a], id_[a], jobDetail_[a], '')
            a += 1
    PrintToHtml(str(a) + ' Jobs updated sucessfully')
def PrintToHtml(message):
    '''
    Append *message* as a timestamped <p> element to the debug.html log.
    '''
    st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    # Explicit encoding: scraped messages may contain non-ASCII text
    # (e.g. German job titles) and the platform default may not be UTF-8.
    with open('debug.html', 'a', encoding='utf-8') as myfile:
        myfile.write('<p>' + str(st) + " | " + message + '</p>')
def SwitchIp():
    '''
    Rotate the Tor exit node by sending NEWNYM over the control port.

    Requires the control port to be enabled:
    echo "ControlPort 9051" >> /etc/tor/torrc
    '''
    with Controller.from_port(port=9051) as controller:
        controller.authenticate()
        controller.signal(Signal.NEWNYM)
def UpdateKarriereAt():
    '''
    Run one www.karriere.at scrape for every configured search keyphrase,
    logging start and end to the debug page.
    '''
    PrintToHtml("Start UpdateKarriereAt")
    for search_term in keywords:
        GetKarriereAtJob(search_term)
    PrintToHtml("End UpdateKarriereAt")
#5089767
def main():
    '''
    Entry point: load the keyword list, reset the debug log and update
    the job database.
    '''
    warnings.simplefilter("ignore", DeprecationWarning)
    load_keywords()
    # Start a fresh debug page; PrintToHtml() appends to it afterwards.
    # Fix vs. original: the skeleton contained a literal "{htmlText}"
    # placeholder — the triple-quoted string was never an f-string and
    # was never .format()ted, so the raw braces were written to the file.
    with open('debug.html', 'w', encoding='utf-8') as myfile:
        myfile.write("""<html>
<head></head>
<body><p></p></body>
</html>""")
    UpdateKarriereAt()


if __name__ == "__main__":
    main()