JobScraper.py
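
"""
Scrape job postings (title, company, location, date, summary and URL) from
in.indeed.com for a user-supplied job title, location and result count, and
save them to a CSV file. The selectors below target in.indeed.com's results
page markup and may break if the site's layout changes.
"""
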
import csv
import time

import requests
from bs4 import BeautifulSoup


def write_csv(loc, info):
    """
    Write the collected job openings to <location>_openings.csv.
    """
    headers = ["Title", "Company Name", "Location", "Date", "Summary", "Url"]
    # Adding info into the rows of the file; newline="" stops the csv module
    # from emitting blank lines between rows on Windows
    with open(loc + "_openings.csv", "a", newline="", encoding="utf-8") as csv_f:
        csv_p = csv.writer(csv_f, delimiter=",")
        csv_p.writerow(headers)
        csv_p.writerows(info)
    print(f"\n{loc}_openings.csv has been saved to your directory!\n")
def job_scraper():
    """
    Scrape the requested number of job openings posted for a given job title
    and location, and store the associated information in a .csv file.
    """
    title = input("\nEnter job title: ").replace(" ", "+")
    loc = input("Enter job location: ").replace(" ", "+")
    num = int(input("Enter the number of job openings to obtain: "))
    url = f"https://in.indeed.com/jobs?q={title}&l={loc}"
    req_page = requests.get(url)
    job_array = []
    if req_page.status_code == 200:
        soup = BeautifulSoup(req_page.text, "html.parser")
        job_table = soup.find("td", id="resultsCol")
        count = 0
        flag = 1
        while flag:
            for job_card in job_table.find_all("div", class_="jobsearch-SerpJobCard"):
                # Getting the job title
                title_elem = job_card.find("a", class_="jobtitle turnstileLink")
                title = title_elem.text.strip()
                # Getting the company name
                company_details = job_card.find("div", class_="sjcl")
                company_name = company_details.find("span", class_="company").text.strip()
                # Getting the company location, falling back to the search location
                company_loc = company_details.find("span", class_="location")
                company_loc = company_loc.text.strip() if company_loc is not None else loc
                # Getting the URL of the post
                link = "https://in.indeed.com" + job_card.find("a")["href"]
                # Getting the date of the post
                date = job_card.find("span", class_="date").text.strip()
                # Getting the job summary (empty if the card carries none)
                summary_elem = job_card.find("div", class_="summary")
                summary = summary_elem.text.strip() if summary_elem is not None else ""
                count += 1
                job_array.append([title, company_name, company_loc, date, summary, link])
                if count == num:
                    flag = 0
                    break
            if not flag:
                break
            # To go to the next page, look for the pagination link labelled "Next"
            pagination = soup.find("ul", class_="pagination-list")
            next_anchor = None
            for anchor in pagination.find_all("a"):
                # Not every pagination anchor has an aria-label, so use .get()
                if anchor.attrs.get("aria-label") == "Next":
                    next_anchor = anchor
                    break
            if next_anchor is not None:
                next_page_link = "https://in.indeed.com" + next_anchor.attrs["href"]
                time.sleep(2)  # brief pause between page requests
                req_page = requests.get(next_page_link)
                soup = BeautifulSoup(req_page.text, "html.parser")
                job_table = soup.find("td", id="resultsCol")
            else:
                flag = 0
        write_csv(loc, job_array)
    else:
        print(
            "There seems to be a problem fetching the results. "
            "Check your inputs and connection, then try again."
        )


if __name__ == "__main__":
    job_scraper()
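
# Example interaction (values are illustrative):
#   Enter job title: data scientist
#   Enter job location: bangalore
#   Enter the number of job openings to obtain: 25
# The scraped postings are then saved to bangalore_openings.csv in the
# current working directory.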