JobScraper.py
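
"""
Scrape job postings (title, company, location, date, summary and URL) from
in.indeed.com for a user-supplied job title, location and result count, and
save them to a CSV file. The selectors below target in.indeed.com's results
page markup and may break if the site's layout changes.
"""
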
import csv
import time

import requests
from bs4 import BeautifulSoup


def write_csv(loc, info):
    """
    Write the collected job openings to <location>_openings.csv.
    """
    headers = ["Title", "Company Name", "Location", "Date", "Summary", "Url"]
    # Adding info into the rows of the file; newline="" stops the csv module
    # from emitting blank lines between rows on Windows
    with open(loc + "_openings.csv", "a", newline="", encoding="utf-8") as csv_f:
        csv_p = csv.writer(csv_f, delimiter=",")
        csv_p.writerow(headers)
        csv_p.writerows(info)
    print(f"\n{loc}_openings.csv has been saved to your directory!\n")
def job_scraper():
    """
    Scrape the requested number of job openings posted for a given job title
    and location, and store the associated information in a .csv file.
    """
    title = input("\nEnter job title: ").replace(" ", "+")
    loc = input("Enter job location: ").replace(" ", "+")
    num = int(input("Enter the number of job openings to obtain: "))
    url = f"https://in.indeed.com/jobs?q={title}&l={loc}"
    req_page = requests.get(url)
    job_array = []
    if req_page.status_code == 200:
        soup = BeautifulSoup(req_page.text, "html.parser")
        job_table = soup.find("td", id="resultsCol")
        count = 0
        flag = 1
        while flag:
            for job_card in job_table.find_all("div", class_="jobsearch-SerpJobCard"):
                # Getting the job title
                title_elem = job_card.find("a", class_="jobtitle turnstileLink")
                title = title_elem.text.strip()
                # Getting the company name
                company_details = job_card.find("div", class_="sjcl")
                company_name = company_details.find("span", class_="company").text.strip()
                # Getting the company location, falling back to the search location
                company_loc = company_details.find("span", class_="location")
                company_loc = company_loc.text.strip() if company_loc is not None else loc
                # Getting the URL of the post
                link = "https://in.indeed.com" + job_card.find("a")["href"]
                # Getting the date of the post
                date = job_card.find("span", class_="date").text.strip()
                # Getting the job summary (empty if the card carries none)
                summary_elem = job_card.find("div", class_="summary")
                summary = summary_elem.text.strip() if summary_elem is not None else ""
                count += 1
                job_array.append([title, company_name, company_loc, date, summary, link])
                if count == num:
                    flag = 0
                    break
            if not flag:
                break
            # To go to the next page, look for the pagination link labelled "Next"
            pagination = soup.find("ul", class_="pagination-list")
            next_anchor = None
            for anchor in pagination.find_all("a"):
                # Not every pagination anchor has an aria-label, so use .get()
                if anchor.attrs.get("aria-label") == "Next":
                    next_anchor = anchor
                    break
            if next_anchor is not None:
                next_page_link = "https://in.indeed.com" + next_anchor.attrs["href"]
                time.sleep(2)  # brief pause between page requests
                req_page = requests.get(next_page_link)
                soup = BeautifulSoup(req_page.text, "html.parser")
                job_table = soup.find("td", id="resultsCol")
            else:
                flag = 0
        write_csv(loc, job_array)
    else:
        print(
            "There seems to be a problem fetching the results. "
            "Check your inputs and connection, then try again."
        )


if __name__ == "__main__":
    job_scraper()
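
# Example interaction (values are illustrative):
#   Enter job title: data scientist
#   Enter job location: bangalore
#   Enter the number of job openings to obtain: 25
# The scraped postings are then saved to bangalore_openings.csv in the
# current working directory.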