From 86eecae74f0dca685336e0c044280f1c1c13f07d Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Wed, 18 Oct 2023 15:29:17 -0500 Subject: [PATCH 01/48] Fixed csv output --- scrapers/craigslist.py | 57 ++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index cb6ad22..b668f88 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -10,6 +10,10 @@ # Add more locations and their batch values as needed } +def clean_price_str(str): + price_str = str.replace("$", "").replace(",", "") + return float(price_str) + def fetch_job_postings(location, category): base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" @@ -23,7 +27,7 @@ def fetch_job_postings(location, category): 'lang': 'en', 'searchPath': "cta", "id": "0", - "collectContactInfo": True, + "collectContactInfo": True, } headers = { @@ -39,27 +43,35 @@ def fetch_job_postings(location, category): if response.status_code == 200: data = response.json() + + with open('file.txt', 'w') as f: + json.dump(data["data"]["items"], f, indent=2) else: print("Failed to retrieve data. Status code:", response.status_code) data = None - job_postings = [] - with open('file.txt', 'w') as f: - json.dump(data, f, indent=2) + car_posts = [] if data: - for item in data["data"]["items"]: - job_title = None - commission = None - for element in item: + # For each car post found + for post in data["data"]["items"]: + title = None + price = None + mileage = None + partial_link = None + + for element in post: if isinstance(element, str): - job_title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 7: - commission = element[1] - if job_title and commission: - job_postings.append((job_title, commission)) - return job_postings - + title = element + elif isinstance(element, list) and len(element) > 0 and element[0] == 10: + price = clean_price_str(element[1]) + elif isinstance(element, list) and len(element) > 0 and element[0] == 9: + mileage = element[1] + elif isinstance(element, list) and len(element) > 0 and element[0] == 6: + partial_link = element[1] + if title and price and mileage and partial_link: + car_posts.append((title, price, mileage, partial_link)) + return car_posts else: print("No data available.") @@ -67,9 +79,10 @@ def fetch_job_postings(location, category): location = "dallas" category = "cta" - job_postings = fetch_job_postings(location, category) + car_posts = fetch_job_postings(location, category) - if job_postings: + if car_posts: + print("we have results") current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") category = category.replace("/", "&") csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" @@ -77,10 +90,10 @@ def fetch_job_postings(location, category): with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: writer = csv.writer(file) - writer.writerow(["Job Title", "Commission"]) - for job in job_postings: - writer.writerow([job[0], job[1]]) + writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) + for car in car_posts: + writer.writerow([car[0], car[1], car[2], car[3]]) - print(f"Job postings have been saved to {csv_filename}") + print(f"Car posts have been saved to {csv_filename}") else: - print("No data available.") \ No newline at end of file + print(car_posts) \ No newline at end of file From cb9bbeddbb49b7d2030fafdb75e8d045e60389ba Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Wed, 18 Oct 2023 15:30:39 -0500 Subject: 
[PATCH 02/48] removed debug print statements --- scrapers/craigslist.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index b668f88..1989d9a 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -82,7 +82,6 @@ def fetch_job_postings(location, category): car_posts = fetch_job_postings(location, category) if car_posts: - print("we have results") current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") category = category.replace("/", "&") csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" @@ -96,4 +95,4 @@ def fetch_job_postings(location, category): print(f"Car posts have been saved to {csv_filename}") else: - print(car_posts) \ No newline at end of file + print("No car posts were found. Nothing was saved") \ No newline at end of file From 8255e4bc7838dd62eb7b97fc22686eced7e309d5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 16:47:13 -0500 Subject: [PATCH 03/48] moved scrapers into src dir --- {scrapers => src/scrapers}/craigslist.py | 0 {scrapers => src/scrapers}/facebook.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {scrapers => src/scrapers}/craigslist.py (100%) rename {scrapers => src/scrapers}/facebook.py (100%) diff --git a/scrapers/craigslist.py b/src/scrapers/craigslist.py similarity index 100% rename from scrapers/craigslist.py rename to src/scrapers/craigslist.py diff --git a/scrapers/facebook.py b/src/scrapers/facebook.py similarity index 100% rename from scrapers/facebook.py rename to src/scrapers/facebook.py From 880cc97e75800cb1a5765010f9d9973aca1d7143 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 16:49:20 -0500 Subject: [PATCH 04/48] renamed craigslist to craigslist-api --- src/scrapers/craigslist-api.py | 98 ++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/scrapers/craigslist-api.py diff --git a/src/scrapers/craigslist-api.py b/src/scrapers/craigslist-api.py new file mode 100644 index 0000000..1989d9a --- /dev/null +++ b/src/scrapers/craigslist-api.py @@ -0,0 +1,98 @@ +from datetime import datetime +import csv +import json +import requests + +location_to_batch = { + "newyork": "3-0-360-0-0", + "philadelphia": "17-0-360-0-0", + "dallas": "21-0-360-0-0", + # Add more locations and their batch values as needed +} + +def clean_price_str(str): + price_str = str.replace("$", "").replace(",", "") + return float(price_str) + +def fetch_job_postings(location, category): + base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" + + # Get the batch value and category abbreviation from the mappings + # Default to New York if location not found + batch = location_to_batch.get(location) + + params = { + 'batch': batch, + 'cc': 'US', + 'lang': 'en', + 'searchPath': "cta", + "id": "0", + "collectContactInfo": True, + } + + headers = { + 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', + 'Referer': f'https://{location}.craigslist.org/', + 'sec-ch-ua-mobile': '?0', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', + 'sec-ch-ua-platform': '"Windows"', + 'Cookie': f'cl_b=COOKIE VALUE' + } + + response = requests.get(base_url, params=params, headers=headers) + + if response.status_code == 200: + data = response.json() + + with open('file.txt', 'w') as f: + json.dump(data["data"]["items"], f, indent=2) + else: + print("Failed to retrieve data. 
Status code:", response.status_code) + data = None + + + car_posts = [] + if data: + # For each car post found + for post in data["data"]["items"]: + title = None + price = None + mileage = None + partial_link = None + + for element in post: + if isinstance(element, str): + title = element + elif isinstance(element, list) and len(element) > 0 and element[0] == 10: + price = clean_price_str(element[1]) + elif isinstance(element, list) and len(element) > 0 and element[0] == 9: + mileage = element[1] + elif isinstance(element, list) and len(element) > 0 and element[0] == 6: + partial_link = element[1] + if title and price and mileage and partial_link: + car_posts.append((title, price, mileage, partial_link)) + return car_posts + else: + print("No data available.") + +if __name__ == "__main__": + location = "dallas" + category = "cta" + + car_posts = fetch_job_postings(location, category) + + if car_posts: + current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") + category = category.replace("/", "&") + csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" + + with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + + writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) + for car in car_posts: + writer.writerow([car[0], car[1], car[2], car[3]]) + + print(f"Car posts have been saved to {csv_filename}") + else: + print("No car posts were found. Nothing was saved") \ No newline at end of file From 58cf7ad512869266238e6f25d34d9961ba60ddea Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 19:24:23 -0500 Subject: [PATCH 05/48] craigslist scraper collects image data --- src/scrapers/craigslist.py | 218 ++++++++++++++++++++----------------- 1 file changed, 120 insertions(+), 98 deletions(-) diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist.py index 1989d9a..cbd06c5 100644 --- a/src/scrapers/craigslist.py +++ b/src/scrapers/craigslist.py @@ -1,98 +1,120 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def clean_price_str(str): - price_str = str.replace("$", "").replace(",", "") - return float(price_str) - -def fetch_job_postings(location, category): - base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - - with open('file.txt', 'w') as f: - json.dump(data["data"]["items"], f, indent=2) - else: - print("Failed to retrieve data. 
Status code:", response.status_code) - data = None - - - car_posts = [] - if data: - # For each car post found - for post in data["data"]["items"]: - title = None - price = None - mileage = None - partial_link = None - - for element in post: - if isinstance(element, str): - title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 10: - price = clean_price_str(element[1]) - elif isinstance(element, list) and len(element) > 0 and element[0] == 9: - mileage = element[1] - elif isinstance(element, list) and len(element) > 0 and element[0] == 6: - partial_link = element[1] - if title and price and mileage and partial_link: - car_posts.append((title, price, mileage, partial_link)) - return car_posts - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - car_posts = fetch_job_postings(location, category) - - if car_posts: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) - for car in car_posts: - writer.writerow([car[0], car[1], car[2], car[3]]) - - print(f"Car posts have been saved to {csv_filename}") - else: - print("No car posts were found. Nothing was saved") \ No newline at end of file +from selenium import webdriver +from bs4 import BeautifulSoup +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.action_chains import ActionChains +import time +from datetime import date + +def scrollTo(x, driver): + driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + +def loadPageResources(driver): + scroll = 100 + + print("Waiting to load...") + time.sleep(2) + + scrollTo(scroll, driver) + + loadImgButtons = driver.find_elements("class name", "slider-back-arrow") + + time.sleep(2) + + # Emulate a user scrolling + for i in range(len(loadImgButtons)): + scroll += 100 + scrollTo(scroll, driver) + + driver.execute_script("arguments[0].click();", loadImgButtons[i]) + + time.sleep(.5) + + +def setupURLs(): + #list of cities to scrape; can be expanded + cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] + + oldestAllowedCars = 2011 + + # Set the URL of the Facebook Marketplace automotive category + base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' + return [base_url.format(city, oldestAllowedCars) for city in cities] + +def setupBrowser(): + print("Setting up headless browser") + + options = Options() + # options.add_argument("--headless=new") + + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options) + +def getAllPosts(browser): + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(browser.page_source, 'html.parser') + + # Find all of the car listings on the page + return soup.find_all('div', class_='gallery-card') + +def getCarImages(): + return "TODO" + +def scrapeCarInfo(post): + title = post.find('span', class_='label').text + + 
print(f'Scraping "{title}"') + + price = post.find('span', class_='priceinfo').text + metadata = post.find('div', class_="meta").text.split('·') + + miles = metadata[1] + if (len(metadata) >= 3): + location = metadata[2] + + link = post.find('a', class_='posting-title', href=True)["href"] + + imageElements = post.findAll('img') + images = [img["src"] for img in imageElements] + + return { + "title": title, + "price": price, + "location": location, + "miles": miles, + "link": link, + "images": images, + "scrapeDate": date.today() + } + +def scrapeCraigslist(): + cityURLs = setupURLs() + browser = setupBrowser() + + # Create a list to store the scraped data + print("Started scraping...") + + for url in cityURLs: + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + + loadPageResources(browser) + + carPosts = getAllPosts(browser) + + # Iterate over the listings and scrape the data + for post in carPosts: + try: + car = scrapeCarInfo(post) + print(car) + except: + print("Incomplete listing info") + + # Close the Selenium WebDriver instance + browser.quit() + +if (__name__ == "__main__"): + scrapeCraigslist() \ No newline at end of file From 59455e41c3a3be0a50de7d4dbd9f92cce256e64b Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 20:12:20 -0500 Subject: [PATCH 06/48] nested craigslist homepage and listing scrapers in their own folder --- src/scrapers/{craigslist.py => craigslist/homepage.py} | 4 ++-- src/scrapers/craigslist/listing.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename src/scrapers/{craigslist.py => craigslist/homepage.py} (95%) create mode 100644 src/scrapers/craigslist/listing.py diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist/homepage.py similarity index 95% rename from src/scrapers/craigslist.py rename to src/scrapers/craigslist/homepage.py index cbd06c5..efdd550 100644 --- a/src/scrapers/craigslist.py +++ b/src/scrapers/craigslist/homepage.py @@ -87,7 +87,7 @@ def scrapeCarInfo(post): "scrapeDate": date.today() } -def scrapeCraigslist(): +def scrapeHomepage(): cityURLs = setupURLs() browser = setupBrowser() @@ -117,4 +117,4 @@ def scrapeCraigslist(): browser.quit() if (__name__ == "__main__"): - scrapeCraigslist() \ No newline at end of file + scrapeHomepage() \ No newline at end of file diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py new file mode 100644 index 0000000..e69de29 From 403e5798a4b0b66eb50df29b6a8ceb7b62a34d33 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 21:54:00 -0500 Subject: [PATCH 07/48] set selenium in headless mode --- src/scrapers/craigslist/homepage.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index efdd550..ad96903 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -44,7 +44,7 @@ def setupBrowser(): print("Setting up headless browser") options = Options() - # options.add_argument("--headless=new") + options.add_argument("--headless=new") print("Creating a new Selenium WebDriver instance") return webdriver.Chrome(options=options) @@ -52,7 +52,7 @@ def setupBrowser(): def getAllPosts(browser): # Create a BeautifulSoup object from the HTML of the page html = browser.page_source - soup = BeautifulSoup(browser.page_source, 'html.parser') + soup = BeautifulSoup(html, 'html.parser') # Find all of the car listings on the page return soup.find_all('div', 
class_='gallery-card') @@ -60,7 +60,7 @@ def getAllPosts(browser): def getCarImages(): return "TODO" -def scrapeCarInfo(post): +def getCarInfo(post): title = post.find('span', class_='label').text print(f'Scraping "{title}"') @@ -91,9 +91,6 @@ def scrapeHomepage(): cityURLs = setupURLs() browser = setupBrowser() - # Create a list to store the scraped data - print("Started scraping...") - for url in cityURLs: # Navigate to the URL print(f"Going to {url}") @@ -108,7 +105,7 @@ def scrapeHomepage(): # Iterate over the listings and scrape the data for post in carPosts: try: - car = scrapeCarInfo(post) + car = getCarInfo(post) print(car) except: print("Incomplete listing info") From 16adc908456a33953dd29552904962f962aa3380 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 22:09:03 -0500 Subject: [PATCH 08/48] can scrape description and attributes of craigslist listing --- src/scrapers/craigslist/listing.py | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py index e69de29..4524184 100644 --- a/src/scrapers/craigslist/listing.py +++ b/src/scrapers/craigslist/listing.py @@ -0,0 +1,39 @@ +import time +from bs4 import BeautifulSoup +from homepage import setupBrowser + +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes + +def scrapeListing(url): + browser = setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + try: + description = soup.find('section', id='postingbody').text + attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + print([attributes, description]) + except: + print(f"Failed scraping {url}") + + # Close the Selenium WebDriver instance + browser.quit() + +scrapeListing("https://abilene.craigslist.org/ctd/d/abilene-hyundai-elantra/7681061021.html") \ No newline at end of file From e08ac194a899dbc4352321a0333ee4b9c95b25d9 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 22:15:16 -0500 Subject: [PATCH 09/48] can scrape description and attributes of craigslist listing --- src/scrapers/craigslist/listing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py index 4524184..4e22245 100644 --- a/src/scrapers/craigslist/listing.py +++ b/src/scrapers/craigslist/listing.py @@ -29,7 +29,11 @@ def scrapeListing(url): description = soup.find('section', id='postingbody').text attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - print([attributes, description]) + map = soup.find('div', id='map') + longitude = map["data-longitude"] + latitude = map["data-latitude"] + + print([attributes, description, longitude, latitude]) except: print(f"Failed scraping {url}") From ef58802be059bf3d0c4b041b463a9928bd7ba1db Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 22:27:02 -0500 Subject: [PATCH 10/48] replaced spaces with tabs --- src/scrapers/craigslist/homepage.py | 150 ++++++++++++++-------------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git 
a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index ad96903..a317838 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -6,112 +6,112 @@ from datetime import date def scrollTo(x, driver): - driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") def loadPageResources(driver): - scroll = 100 + scroll = 100 - print("Waiting to load...") - time.sleep(2) + print("Waiting to load...") + time.sleep(2) - scrollTo(scroll, driver) + scrollTo(scroll, driver) - loadImgButtons = driver.find_elements("class name", "slider-back-arrow") + loadImgButtons = driver.find_elements("class name", "slider-back-arrow") - time.sleep(2) + time.sleep(2) - # Emulate a user scrolling - for i in range(len(loadImgButtons)): - scroll += 100 - scrollTo(scroll, driver) + # Emulate a user scrolling + for i in range(len(loadImgButtons)): + scroll += 100 + scrollTo(scroll, driver) - driver.execute_script("arguments[0].click();", loadImgButtons[i]) + driver.execute_script("arguments[0].click();", loadImgButtons[i]) - time.sleep(.5) + time.sleep(.5) def setupURLs(): - #list of cities to scrape; can be expanded - cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] + #list of cities to scrape; can be expanded + cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] - oldestAllowedCars = 2011 + oldestAllowedCars = 2011 - # Set the URL of the Facebook Marketplace automotive category - base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' - return [base_url.format(city, oldestAllowedCars) for city in cities] + # Set the URL of the Facebook Marketplace automotive category + base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' + return [base_url.format(city, oldestAllowedCars) for city in cities] def setupBrowser(): - print("Setting up headless browser") + print("Setting up headless browser") - options = Options() - options.add_argument("--headless=new") + options = Options() + options.add_argument("--headless=new") - print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options) def getAllPosts(browser): - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') - # Find all of the car listings on the page - return soup.find_all('div', class_='gallery-card') + # Find all of the car listings on the page + return soup.find_all('div', class_='gallery-card') def getCarImages(): - return "TODO" + return "TODO" def getCarInfo(post): - title = post.find('span', class_='label').text - - 
print(f'Scraping "{title}"') - - price = post.find('span', class_='priceinfo').text - metadata = post.find('div', class_="meta").text.split('·') - - miles = metadata[1] - if (len(metadata) >= 3): - location = metadata[2] - - link = post.find('a', class_='posting-title', href=True)["href"] - - imageElements = post.findAll('img') - images = [img["src"] for img in imageElements] - - return { - "title": title, - "price": price, - "location": location, - "miles": miles, - "link": link, - "images": images, - "scrapeDate": date.today() - } + title = post.find('span', class_='label').text + + print(f'Scraping "{title}"') + + price = post.find('span', class_='priceinfo').text + metadata = post.find('div', class_="meta").text.split('·') + + miles = metadata[1] + if (len(metadata) >= 3): + location = metadata[2] + + link = post.find('a', class_='posting-title', href=True)["href"] + + imageElements = post.findAll('img') + images = [img["src"] for img in imageElements] + + return { + "title": title, + "price": price, + "location": location, + "miles": miles, + "link": link, + "images": images, + "scrapeDate": date.today() + } def scrapeHomepage(): - cityURLs = setupURLs() - browser = setupBrowser() + cityURLs = setupURLs() + browser = setupBrowser() - for url in cityURLs: - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) + for url in cityURLs: + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) - print(f"Loading cars from {url}") + print(f"Loading cars from {url}") - loadPageResources(browser) + loadPageResources(browser) - carPosts = getAllPosts(browser) + carPosts = getAllPosts(browser) - # Iterate over the listings and scrape the data - for post in carPosts: - try: - car = getCarInfo(post) - print(car) - except: - print("Incomplete listing info") - - # Close the Selenium WebDriver instance - browser.quit() + # Iterate over the listings and scrape the data + for post in carPosts: + try: + car = getCarInfo(post) + print(car) + except: + print("Incomplete listing info") + + # Close the Selenium WebDriver instance + browser.quit() if (__name__ == "__main__"): - scrapeHomepage() \ No newline at end of file + scrapeHomepage() \ No newline at end of file From be1975ae809b2a38235ab4f58c4ee57fc82952a5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 27 Oct 2023 10:24:55 -0500 Subject: [PATCH 11/48] updated miles label to odometer --- src/scrapers/craigslist/homepage.py | 7 ++++--- src/scrapers/craigslist/listing.py | 4 +--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index a317838..7788c40 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -68,7 +68,7 @@ def getCarInfo(post): price = post.find('span', class_='priceinfo').text metadata = post.find('div', class_="meta").text.split('·') - miles = metadata[1] + odometer = metadata[1] if (len(metadata) >= 3): location = metadata[2] @@ -78,13 +78,14 @@ def getCarInfo(post): images = [img["src"] for img in imageElements] return { + "_id": link, "title": title, "price": price, "location": location, - "miles": miles, + "odometer": odometer, "link": link, "images": images, - "scrapeDate": date.today() + "scrapeDate": str(date.today()) } def scrapeHomepage(): diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py index 4e22245..bd3a083 100644 --- a/src/scrapers/craigslist/listing.py +++ b/src/scrapers/craigslist/listing.py @@ -38,6 +38,4 @@ def scrapeListing(url): 
print(f"Failed scraping {url}") # Close the Selenium WebDriver instance - browser.quit() - -scrapeListing("https://abilene.craigslist.org/ctd/d/abilene-hyundai-elantra/7681061021.html") \ No newline at end of file + browser.quit() \ No newline at end of file From ea32b66292232768fd349cfbb79e15821d20a2c7 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 18:14:12 -0500 Subject: [PATCH 12/48] added .pyc files to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index c6bba59..f725835 100644 --- a/.gitignore +++ b/.gitignore @@ -128,3 +128,6 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* + +# python +*.pyc \ No newline at end of file From b3fe6af9ff895fbb7ef247a291dafba8e72ad5e1 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 18:16:36 -0500 Subject: [PATCH 13/48] removed un-used import --- src/scrapers/craigslist/homepage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index 7788c40..482cf6d 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -1,7 +1,6 @@ from selenium import webdriver from bs4 import BeautifulSoup from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.action_chains import ActionChains import time from datetime import date @@ -31,7 +30,7 @@ def loadPageResources(driver): def setupURLs(): - #list of cities to scrape; can be expanded + # List of TX cities to scrape; can be expanded cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] oldestAllowedCars = 2011 From 66ac3dd453c793a811e1b767081ffd07c6358ce8 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 20:09:00 -0500 Subject: [PATCH 14/48] delete duplicate craigslist file --- src/scrapers/craigslist.py | 98 -------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 src/scrapers/craigslist.py diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist.py deleted file mode 100644 index 1989d9a..0000000 --- a/src/scrapers/craigslist.py +++ /dev/null @@ -1,98 +0,0 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def clean_price_str(str): - price_str = str.replace("$", "").replace(",", "") - return float(price_str) - -def fetch_job_postings(location, category): - base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 
'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - - with open('file.txt', 'w') as f: - json.dump(data["data"]["items"], f, indent=2) - else: - print("Failed to retrieve data. Status code:", response.status_code) - data = None - - - car_posts = [] - if data: - # For each car post found - for post in data["data"]["items"]: - title = None - price = None - mileage = None - partial_link = None - - for element in post: - if isinstance(element, str): - title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 10: - price = clean_price_str(element[1]) - elif isinstance(element, list) and len(element) > 0 and element[0] == 9: - mileage = element[1] - elif isinstance(element, list) and len(element) > 0 and element[0] == 6: - partial_link = element[1] - if title and price and mileage and partial_link: - car_posts.append((title, price, mileage, partial_link)) - return car_posts - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - car_posts = fetch_job_postings(location, category) - - if car_posts: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) - for car in car_posts: - writer.writerow([car[0], car[1], car[2], car[3]]) - - print(f"Car posts have been saved to {csv_filename}") - else: - print("No car posts were found. Nothing was saved") \ No newline at end of file From 8d2acb4dc8b1ceed0049be407f3ff5231769c553 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 20:11:44 -0500 Subject: [PATCH 15/48] moved scrapers dir to root --- {src/scrapers => scrapers}/craigslist-api.py | 0 {src/scrapers => scrapers}/craigslist/homepage.py | 0 {src/scrapers => scrapers}/craigslist/listing.py | 0 {src/scrapers => scrapers}/database.py | 0 {src/scrapers => scrapers}/facebook.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename {src/scrapers => scrapers}/craigslist-api.py (100%) rename {src/scrapers => scrapers}/craigslist/homepage.py (100%) rename {src/scrapers => scrapers}/craigslist/listing.py (100%) rename {src/scrapers => scrapers}/database.py (100%) rename {src/scrapers => scrapers}/facebook.py (100%) diff --git a/src/scrapers/craigslist-api.py b/scrapers/craigslist-api.py similarity index 100% rename from src/scrapers/craigslist-api.py rename to scrapers/craigslist-api.py diff --git a/src/scrapers/craigslist/homepage.py b/scrapers/craigslist/homepage.py similarity index 100% rename from src/scrapers/craigslist/homepage.py rename to scrapers/craigslist/homepage.py diff --git a/src/scrapers/craigslist/listing.py b/scrapers/craigslist/listing.py similarity index 100% rename from src/scrapers/craigslist/listing.py rename to scrapers/craigslist/listing.py diff --git a/src/scrapers/database.py b/scrapers/database.py similarity index 100% rename from src/scrapers/database.py rename to scrapers/database.py diff --git a/src/scrapers/facebook.py b/scrapers/facebook.py similarity index 100% rename from src/scrapers/facebook.py rename to scrapers/facebook.py From b4cc2f4c49e626bad01460cb1e2196c3b06b2bdf Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 
Oct 2023 10:45:45 -0500 Subject: [PATCH 16/48] Grouped homepage and listing files into one craigslist file --- .../{craigslist/homepage.py => craigslist.py} | 71 +++++++++---------- scrapers/craigslist/listing.py | 41 ----------- scrapers/scrapers.py | 42 +++++++++++ 3 files changed, 76 insertions(+), 78 deletions(-) rename scrapers/{craigslist/homepage.py => craigslist.py} (67%) delete mode 100644 scrapers/craigslist/listing.py create mode 100644 scrapers/scrapers.py diff --git a/scrapers/craigslist/homepage.py b/scrapers/craigslist.py similarity index 67% rename from scrapers/craigslist/homepage.py rename to scrapers/craigslist.py index 482cf6d..66bc8df 100644 --- a/scrapers/craigslist/homepage.py +++ b/scrapers/craigslist.py @@ -2,7 +2,6 @@ from bs4 import BeautifulSoup from selenium.webdriver.chrome.options import Options import time -from datetime import date def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") @@ -29,12 +28,10 @@ def loadPageResources(driver): time.sleep(.5) -def setupURLs(): +def setupURLs(oldestAllowedCars): # List of TX cities to scrape; can be expanded cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] - oldestAllowedCars = 2011 - # Set the URL of the Facebook Marketplace automotive category base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' return [base_url.format(city, oldestAllowedCars) for city in cities] @@ -76,42 +73,42 @@ def getCarInfo(post): imageElements = post.findAll('img') images = [img["src"] for img in imageElements] - return { - "_id": link, - "title": title, - "price": price, - "location": location, - "odometer": odometer, - "link": link, - "images": images, - "scrapeDate": str(date.today()) - } - -def scrapeHomepage(): - cityURLs = setupURLs() - browser = setupBrowser() + return title, price, location, odometer, link, images - for url in cityURLs: - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) - print(f"Loading cars from {url}") + return processedAttributes - loadPageResources(browser) +def scrapeListing(url): + browser = setupBrowser() - carPosts = getAllPosts(browser) + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) - # Iterate over the listings and scrape the data - for post in carPosts: - try: - car = getCarInfo(post) - print(car) - except: - print("Incomplete listing info") - - # Close the Selenium WebDriver instance - browser.quit() + print(f"Loading page for {url}") + time.sleep(1) -if (__name__ == "__main__"): - scrapeHomepage() \ No newline at end of file + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + try: + description = soup.find('section', id='postingbody').text + attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + map = soup.find('div', id='map') + longitude = map["data-longitude"] + latitude = map["data-latitude"] + + print([attributes, description, longitude, latitude]) + 
except: + print(f"Failed scraping {url}") + + # Close the Selenium WebDriver instance + browser.quit() \ No newline at end of file diff --git a/scrapers/craigslist/listing.py b/scrapers/craigslist/listing.py deleted file mode 100644 index bd3a083..0000000 --- a/scrapers/craigslist/listing.py +++ /dev/null @@ -1,41 +0,0 @@ -import time -from bs4 import BeautifulSoup -from homepage import setupBrowser - -def processAttributes(attributes): - processedAttributes = [] - - for attr in attributes: - [label, value] = attr.split(": ") - processedAttributes.append({"label": label, "value": value}) - - return processedAttributes - -def scrapeListing(url): - browser = setupBrowser() - - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) - - print(f"Loading page for {url}") - time.sleep(1) - - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') - - try: - description = soup.find('section', id='postingbody').text - attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - - map = soup.find('div', id='map') - longitude = map["data-longitude"] - latitude = map["data-latitude"] - - print([attributes, description, longitude, latitude]) - except: - print(f"Failed scraping {url}") - - # Close the Selenium WebDriver instance - browser.quit() \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py new file mode 100644 index 0000000..8c14383 --- /dev/null +++ b/scrapers/scrapers.py @@ -0,0 +1,42 @@ +import craigslist as cl +import database as db +from typing import Optional +import typer +from typing_extensions import Annotated + +app = typer.Typer() + +@app.command() +def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): + cityURLs = cl.setupURLs(minYear) + browser = cl.setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + cl.loadPageResources(browser) + + carPosts = cl.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = cl.getCarInfo(post) + db.post_raw("craigslist", title, price, location, odometer, link, images) + except Exception as error: + print(error) + + browser.quit() + +@app.command() +def link(link: str): + if (".craigslist.org" in link): + cl.scrapeListing(link) + elif("https://www.facebook.com/marketplace" in link): + print("facebook marketplace") + else: + print("Not a Craigslist nor a Facebook Marketplace link") + +if __name__ == "__main__": + app() \ No newline at end of file From b2a1087d1cc376acab2d29995e999fc84f509417 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 10:46:32 -0500 Subject: [PATCH 17/48] Removed craigslist-api file --- scrapers/craigslist-api.py | 98 -------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 scrapers/craigslist-api.py diff --git a/scrapers/craigslist-api.py b/scrapers/craigslist-api.py deleted file mode 100644 index 1989d9a..0000000 --- a/scrapers/craigslist-api.py +++ /dev/null @@ -1,98 +0,0 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def clean_price_str(str): - price_str = str.replace("$", "").replace(",", "") - return float(price_str) - -def fetch_job_postings(location, category): 
- base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - - with open('file.txt', 'w') as f: - json.dump(data["data"]["items"], f, indent=2) - else: - print("Failed to retrieve data. Status code:", response.status_code) - data = None - - - car_posts = [] - if data: - # For each car post found - for post in data["data"]["items"]: - title = None - price = None - mileage = None - partial_link = None - - for element in post: - if isinstance(element, str): - title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 10: - price = clean_price_str(element[1]) - elif isinstance(element, list) and len(element) > 0 and element[0] == 9: - mileage = element[1] - elif isinstance(element, list) and len(element) > 0 and element[0] == 6: - partial_link = element[1] - if title and price and mileage and partial_link: - car_posts.append((title, price, mileage, partial_link)) - return car_posts - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - car_posts = fetch_job_postings(location, category) - - if car_posts: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) - for car in car_posts: - writer.writerow([car[0], car[1], car[2], car[3]]) - - print(f"Car posts have been saved to {csv_filename}") - else: - print("No car posts were found. 
Nothing was saved") \ No newline at end of file From f1c73723903cf6017ad757a64caa46913ed3148c Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 10:51:19 -0500 Subject: [PATCH 18/48] extracted utils from craigslist scraper --- scrapers/craigslist.py | 19 +++---------------- scrapers/scrapers.py | 11 +++++++---- scrapers/utils.py | 14 ++++++++++++++ 3 files changed, 24 insertions(+), 20 deletions(-) create mode 100644 scrapers/utils.py diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 66bc8df..ca37236 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -1,10 +1,6 @@ -from selenium import webdriver from bs4 import BeautifulSoup -from selenium.webdriver.chrome.options import Options import time - -def scrollTo(x, driver): - driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") +import utils def loadPageResources(driver): scroll = 100 @@ -12,7 +8,7 @@ def loadPageResources(driver): print("Waiting to load...") time.sleep(2) - scrollTo(scroll, driver) + utils.scrollTo(scroll, driver) loadImgButtons = driver.find_elements("class name", "slider-back-arrow") @@ -21,7 +17,7 @@ def loadPageResources(driver): # Emulate a user scrolling for i in range(len(loadImgButtons)): scroll += 100 - scrollTo(scroll, driver) + utils.scrollTo(scroll, driver) driver.execute_script("arguments[0].click();", loadImgButtons[i]) @@ -36,15 +32,6 @@ def setupURLs(oldestAllowedCars): base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' return [base_url.format(city, oldestAllowedCars) for city in cities] -def setupBrowser(): - print("Setting up headless browser") - - options = Options() - options.add_argument("--headless=new") - - print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) - def getAllPosts(browser): # Create a BeautifulSoup object from the HTML of the page html = browser.page_source diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 8c14383..e721237 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,15 +1,18 @@ -import craigslist as cl -import database as db from typing import Optional -import typer from typing_extensions import Annotated +import typer + +import craigslist as cl +import facebook as fb +import database as db +import utils app = typer.Typer() @app.command() def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): cityURLs = cl.setupURLs(minYear) - browser = cl.setupBrowser() + browser = utils.setupBrowser() for url in cityURLs: print(f"Going to {url}") diff --git a/scrapers/utils.py b/scrapers/utils.py new file mode 100644 index 0000000..2a5a865 --- /dev/null +++ b/scrapers/utils.py @@ -0,0 +1,14 @@ +from selenium import webdriver +from selenium.webdriver.chrome.options import Options + +def scrollTo(x, driver): + driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + +def setupBrowser(): + print("Setting up headless browser") + + options = Options() + options.add_argument("--headless=new") + + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options) \ No newline at end of file From 5e694dd9dea32e4d44ae818799220d9ec4c42c81 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:03:48 -0500 Subject: [PATCH 19/48] removed un-used function --- scrapers/craigslist.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index ca37236..213cf45 100644 --- 
a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -40,9 +40,6 @@ def getAllPosts(browser): # Find all of the car listings on the page return soup.find_all('div', class_='gallery-card') -def getCarImages(): - return "TODO" - def getCarInfo(post): title = post.find('span', class_='label').text From 912006b0bc68d9804a4f668184800f3ad309e752 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:36:48 -0500 Subject: [PATCH 20/48] Updated facebook scraper --- scrapers/facebook.py | 206 ++++++++++++++++++++----------------------- scrapers/scrapers.py | 27 +++++- 2 files changed, 123 insertions(+), 110 deletions(-) diff --git a/scrapers/facebook.py b/scrapers/facebook.py index 00b5e45..5b8c43f 100644 --- a/scrapers/facebook.py +++ b/scrapers/facebook.py @@ -1,111 +1,101 @@ -from selenium import webdriver from bs4 import BeautifulSoup -from selenium.webdriver.chrome.options import Options import time +import utils -import database - - -#list of cities to scrape; can be expanded -cities = [ - 'nyc', 'la', 'chicago', 'houston', 'miami', - 'philadelphia', 'phoenix', 'sanantonio', 'sandiego', 'dallas', - 'sanjose', 'austin', 'jacksonville', 'fortworth', 'columbus', - 'charlotte', 'sanfrancisco', 'indianapolis', 'seattle', 'denver', - 'washington', 'boston', 'elpaso', 'nashville', 'detroit', 'portland', 'lasvegas', 'memphis', 'louisville', - 'baltimore', 'milwaukee', 'albuquerque', 'tucson', 'fresno', - 'kansascity', 'mesa', 'atlanta', - 'coloradosprings', 'virginiabeach', 'raleigh', 'omaha', 'miami', - 'oakland', 'minneapolis', 'tulsa', 'wichita', 'neworleans' -] - -# Set the URL of the Facebook Marketplace automotive category -base_url = 'https://www.facebook.com/marketplace/{}/vehicles' -urls = [base_url.format(city) for city in cities] - -# Create a new Selenium WebDriver instance - -print("Setting up headless browser") -options = Options() -options.add_argument("--headless=new") - -print("Creating a new Selenium WebDriver instance") -driver = webdriver.Chrome(options=options) - -# Create a list to store the scraped data -print("Started scraping...") -data = {} -for url in urls: - # Navigate to the URL - print(f"Navigating to {url}") - driver.get(url) - - print(f"Loading {url}") - - time.sleep(2) - scroll = 2000 - - # Wait for the page to load - time.sleep(2) - - for i in range(50): - driver.execute_script(f"window.scrollTo(1, {scroll})") - scroll += 1000 - time.sleep(.5) - - # Get the HTML of the page - html = driver.page_source - - # Create a BeautifulSoup object from the HTML - soup = BeautifulSoup(html, 'html.parser') - - # Find all of the automotive listings on the page - car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24') - - # Iterate over the listings and scrape the data - for post in car_posts: - print("Scraping new listing") - try: - # Get the title of the listing - title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text - except AttributeError: - title = 'N/A' # Handle missing title - - try: - # Get the price of the listing - price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text - except AttributeError: - price = 'N/A' # Handle missing price - - try: - # Get the location of the listing - location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text - except 
AttributeError: - location = 'N/A' # Handle missing location - - try: - # Get the miles of the car - miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text - except (AttributeError, IndexError): - miles = 'N/A' # Handle missing miles - - try: - # Get the link to the listing - link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href'] - except (AttributeError, TypeError): - link = 'N/A' # Handle missing link - - # Add the data to the list - if (title, price, location, miles, link) not in data: - data[(title, price, location, miles, link)] = True - postSuccess = database.post_raw("facebook", title, price, location, miles, link) - if (postSuccess): - print("Save to DB") - else: - print("Failed to save to DB") - else: - print("Listing is a duplicate") - - -# Close the Selenium WebDriver instance -driver.quit() \ No newline at end of file +postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" +linkClass = "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv" +thumbnailClass = "xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3" +titleClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6" +priceClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u" +metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" + +def loadPageResources(driver): + scroll = 100 + + print("Waiting to load...") + time.sleep(2) + utils.scrollTo(scroll, driver) + time.sleep(1.5) + + # Emulate a user scrolling + for i in range(10): + scroll += 1000 + utils.scrollTo(scroll, driver) + time.sleep(1) + + +def setupURLs(oldestAllowedCars): + # List of TX cities to scrape; can be expanded + cities = ['houston', 'dallas', 'austin', 'fortworth', 'elpaso', 'sanantonio'] + + # Set the URL of the Facebook Marketplace automotive category + base_url = 'https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false' + return [base_url.format(city, oldestAllowedCars) for city in cities] + +def getAllPosts(browser): + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + # Find all of the car listings on the page + return soup.find_all('div', class_=postClass) + +def getCarInfo(post): + title = post.find('span', class_=titleClass).text + + print(f'Scraping "{title}"') + + price = post.find('span', class_=priceClass).text + metadata = post.findAll('span', class_=metaClass) + + location = metadata[0].text + odometer = metadata[1].text + + link = post.find('a', class_=linkClass, href=True)["href"] + link = "https://facebook.com" + link + + thumbnail = post.find('img', class_=thumbnailClass)["src"] + + return title, price, location, odometer, link, [thumbnail] + +def getCarImages(): + # class="x1a0syf3 x1ja2u2z" + return "TODO" + +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes + +def 
scrapeListing(url): + browser = setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + try: + description = soup.find('section', id='postingbody').text + attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + map = soup.find('div', id='map') + longitude = map["data-longitude"] + latitude = map["data-latitude"] + + print([attributes, description, longitude, latitude]) + except: + print(f"Failed scraping {url}") + + # Close the Selenium WebDriver instance + browser.quit() \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index e721237..883d3fb 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -10,8 +10,8 @@ app = typer.Typer() @app.command() -def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): - cityURLs = cl.setupURLs(minYear) +def craigslist(): + cityURLs = cl.setupURLs(2011) browser = utils.setupBrowser() for url in cityURLs: @@ -32,6 +32,29 @@ def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): browser.quit() +@app.command() +def facebook(): + cityURLs = fb.setupURLs(2011) + browser = utils.setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + fb.loadPageResources(browser) + + carPosts = fb.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = fb.getCarInfo(post) + db.post_raw("facebook", title, price, location, odometer, link, images) + except Exception as error: + print(error) + + browser.quit() + @app.command() def link(link: str): if (".craigslist.org" in link): From f1298de3adc3a27947040eb2171fbb1247934635 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:40:56 -0500 Subject: [PATCH 21/48] Extracted scraper logic into utils --- scrapers/scrapers.py | 51 ++++++-------------------------------------- scrapers/utils.py | 26 +++++++++++++++++++++- 2 files changed, 31 insertions(+), 46 deletions(-) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 883d3fb..b10029f 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -2,65 +2,26 @@ from typing_extensions import Annotated import typer -import craigslist as cl -import facebook as fb -import database as db +import craigslist +import facebook import utils app = typer.Typer() @app.command() def craigslist(): - cityURLs = cl.setupURLs(2011) - browser = utils.setupBrowser() - - for url in cityURLs: - print(f"Going to {url}") - browser.get(url) - - print(f"Loading cars from {url}") - cl.loadPageResources(browser) - - carPosts = cl.getAllPosts(browser) - - for post in carPosts: - try: - title, price, location, odometer, link, images = cl.getCarInfo(post) - db.post_raw("craigslist", title, price, location, odometer, link, images) - except Exception as error: - print(error) - - browser.quit() + utils.scrape(craigslist, "craigslist") @app.command() def facebook(): - cityURLs = fb.setupURLs(2011) - browser = utils.setupBrowser() - - for url in cityURLs: - print(f"Going to {url}") - browser.get(url) - - print(f"Loading cars from {url}") - fb.loadPageResources(browser) - - carPosts = fb.getAllPosts(browser) - - for post in carPosts: - try: - title, price, location, odometer, link, images = fb.getCarInfo(post) - 
db.post_raw("facebook", title, price, location, odometer, link, images) - except Exception as error: - print(error) - - browser.quit() + utils.scrape(facebook, "facebook") @app.command() def link(link: str): if (".craigslist.org" in link): - cl.scrapeListing(link) + craigslist.scrapeListing(link) elif("https://www.facebook.com/marketplace" in link): - print("facebook marketplace") + facebook.scrapeListing(link) else: print("Not a Craigslist nor a Facebook Marketplace link") diff --git a/scrapers/utils.py b/scrapers/utils.py index 2a5a865..d8cc307 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -1,5 +1,7 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options +import utils +import database as db def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") @@ -11,4 +13,26 @@ def setupBrowser(): options.add_argument("--headless=new") print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) \ No newline at end of file + return webdriver.Chrome(options=options) + +def scrape(scraper, website): + cityURLs = scraper.setupURLs(2011) + browser = utils.setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + scraper.loadPageResources(browser) + + carPosts = scraper.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = scraper.getCarInfo(post) + db.post_raw(website, title, price, location, odometer, link, images) + except Exception as error: + print(error) + + browser.quit() \ No newline at end of file From f6c0a03b49e36cd7ae5c5b3f403c4ce195ae4d9f Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:41:31 -0500 Subject: [PATCH 22/48] removed un-used imports --- scrapers/scrapers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index b10029f..efaf399 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,7 +1,4 @@ -from typing import Optional -from typing_extensions import Annotated import typer - import craigslist import facebook import utils From 70c01d4f24b38c1925c28cbc2a2af9cc8f6db521 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 12:02:01 -0500 Subject: [PATCH 23/48] Track scraper versions in db --- scrapers/database.py | 9 +++++---- scrapers/scrapers.py | 7 +++++-- scrapers/utils.py | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/scrapers/database.py b/scrapers/database.py index ed36ac8..10916ca 100644 --- a/scrapers/database.py +++ b/scrapers/database.py @@ -20,15 +20,16 @@ def get_conn(db): # use a database named "Test" return {"success" : True, "db": client.get_database(db)} -def post_raw(source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): +def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): car = { + "source": source, + "scraperVersion": scraperVersion, + "scrapeDate": str(date.today()), "title": title, "price": price, "location": location, "odometer": miles, - "link": link, - "source": source, - "scrapeDate": str(date.today()) + "link": link } if (images is not None): diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index efaf399..8f6d92a 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -5,13 +5,16 @@ app = 
typer.Typer() +craigslistScraperVersion = 1 +facebookScraperVersion = 1 + @app.command() def craigslist(): - utils.scrape(craigslist, "craigslist") + utils.scrape(craigslist, "craigslist", craigslistScraperVersion) @app.command() def facebook(): - utils.scrape(facebook, "facebook") + utils.scrape(facebook, "facebook", facebookScraperVersion) @app.command() def link(link: str): diff --git a/scrapers/utils.py b/scrapers/utils.py index d8cc307..2daf8f7 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -15,7 +15,7 @@ def setupBrowser(): print("Creating a new Selenium WebDriver instance") return webdriver.Chrome(options=options) -def scrape(scraper, website): +def scrape(scraper, website, scraperVersion): cityURLs = scraper.setupURLs(2011) browser = utils.setupBrowser() @@ -31,7 +31,7 @@ def scrape(scraper, website): for post in carPosts: try: title, price, location, odometer, link, images = scraper.getCarInfo(post) - db.post_raw(website, title, price, location, odometer, link, images) + db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) except Exception as error: print(error) From 2182688247f0ba3db1d163358e6fefb2e51a685e Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 12:48:46 -0500 Subject: [PATCH 24/48] use link as _id for db --- scrapers/craigslist.py | 2 +- scrapers/database.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 213cf45..1f4553d 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -69,7 +69,7 @@ def processAttributes(attributes): return processedAttributes def scrapeListing(url): - browser = setupBrowser() + browser = utils.setupBrowser() # Navigate to the URL print(f"Going to {url}") diff --git a/scrapers/database.py b/scrapers/database.py index 10916ca..e168344 100644 --- a/scrapers/database.py +++ b/scrapers/database.py @@ -22,6 +22,7 @@ def get_conn(db): def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): car = { + "_id": link, "source": source, "scraperVersion": scraperVersion, "scrapeDate": str(date.today()), @@ -49,10 +50,24 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images car[attr["label"]] = attr["value"] # Insert into collection called "scrape_test" - conn = get_conn("scrape") + conn = get_conn("Test") if (conn["success"]): - result = conn["db"]["scraped_raw"].insert_one(car) + result = conn["db"]["raw"].insert_one(car) return result.acknowledged else: - return False \ No newline at end of file + return False + +def update(link, newFields): + conn = get_conn("Test") + if (conn["success"]): + result = conn["db"]["raw"].update( + {'_id': link}, + { + '$set': newFields + } + ) + return result.acknowledged + else: + return False + From 146948aa643a9128b469a70cbbc7483e68592e59 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 12:49:23 -0500 Subject: [PATCH 25/48] Fixed import issues --- scrapers/scrapers.py | 12 ++++++------ scrapers/utils.py | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 8f6d92a..aeaab0d 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,6 +1,6 @@ import typer -import craigslist -import facebook +import craigslist as cl +import facebook as fb import utils app = typer.Typer() @@ -10,18 +10,18 @@ @app.command() def 
craigslist(): - utils.scrape(craigslist, "craigslist", craigslistScraperVersion) + utils.scrape("craigslist", craigslistScraperVersion) @app.command() def facebook(): - utils.scrape(facebook, "facebook", facebookScraperVersion) + utils.scrape("facebook", facebookScraperVersion) @app.command() def link(link: str): if (".craigslist.org" in link): - craigslist.scrapeListing(link) + cl.scrapeListing(link) elif("https://www.facebook.com/marketplace" in link): - facebook.scrapeListing(link) + fb.scrapeListing(link) else: print("Not a Craigslist nor a Facebook Marketplace link") diff --git a/scrapers/utils.py b/scrapers/utils.py index 2daf8f7..a62a1bc 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -3,6 +3,9 @@ import utils import database as db +import craigslist +import facebook + def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") @@ -15,7 +18,12 @@ def setupBrowser(): print("Creating a new Selenium WebDriver instance") return webdriver.Chrome(options=options) -def scrape(scraper, website, scraperVersion): +def scrape(website, scraperVersion): + if (website == 'craigslist'): + scraper = craigslist + elif (website == 'facebook'): + scraper = facebook + cityURLs = scraper.setupURLs(2011) browser = utils.setupBrowser() @@ -31,7 +39,11 @@ def scrape(scraper, website, scraperVersion): for post in carPosts: try: title, price, location, odometer, link, images = scraper.getCarInfo(post) - db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) + success = db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) + if (success): + print("posted to db") + else: + print("failed to post to db") except Exception as error: print(error) From 7f893e09763783b7f5355407e177438bacf434d5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 13:02:45 -0500 Subject: [PATCH 26/48] craigslist scraper functionality complete --- scrapers/craigslist.py | 16 +++++++++++----- scrapers/database.py | 6 +++--- scrapers/scrapers.py | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 1f4553d..82a1706 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -64,7 +64,7 @@ def processAttributes(attributes): for attr in attributes: [label, value] = attr.split(": ") - processedAttributes.append({"label": label, "value": value}) + processedAttributes.append({"label": label.replace(" ", "-").lower(), "value": value}) return processedAttributes @@ -85,12 +85,18 @@ def scrapeListing(url): try: description = soup.find('section', id='postingbody').text attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - map = soup.find('div', id='map') - longitude = map["data-longitude"] - latitude = map["data-latitude"] - print([attributes, description, longitude, latitude]) + car = { + "postBody": description, + "longitude": map["data-longitude"], + "latitude": map["data-latitude"] + } + + for attr in attributes: + car[attr["label"]] = attr["value"] + + return car except: print(f"Failed scraping {url}") diff --git a/scrapers/database.py b/scrapers/database.py index e168344..40285e2 100644 --- a/scrapers/database.py +++ b/scrapers/database.py @@ -24,8 +24,8 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images car = { "_id": link, "source": source, - "scraperVersion": scraperVersion, - "scrapeDate": str(date.today()), + "scraper-version": 
scraperVersion, + "scrape-date": str(date.today()), "title": title, "price": price, "location": location, @@ -61,7 +61,7 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images def update(link, newFields): conn = get_conn("Test") if (conn["success"]): - result = conn["db"]["raw"].update( + result = conn["db"]["raw"].update_one( {'_id': link}, { '$set': newFields diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index aeaab0d..a3795e3 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,7 +1,8 @@ import typer +import utils import craigslist as cl import facebook as fb -import utils +import database as db app = typer.Typer() @@ -19,7 +20,8 @@ def facebook(): @app.command() def link(link: str): if (".craigslist.org" in link): - cl.scrapeListing(link) + newInfo = cl.scrapeListing(link) + db.update(link, newInfo) elif("https://www.facebook.com/marketplace" in link): fb.scrapeListing(link) else: From b40d0c7c93d98e0f99a106b3639f65c12a628e24 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 13:49:46 -0500 Subject: [PATCH 27/48] extracted click function into utils --- scrapers/craigslist.py | 2 +- scrapers/utils.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 82a1706..8dc92cc 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -19,7 +19,7 @@ def loadPageResources(driver): scroll += 100 utils.scrollTo(scroll, driver) - driver.execute_script("arguments[0].click();", loadImgButtons[i]) + utils.clickOn(loadImgButtons[i], driver) time.sleep(.5) diff --git a/scrapers/utils.py b/scrapers/utils.py index a62a1bc..add41cf 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -9,6 +9,9 @@ def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") +def clickOn(elem, driver): + driver.execute_script("arguments[0].click();", elem) + def setupBrowser(): print("Setting up headless browser") From 8d9b30a8525ed964bf3004ee6a9bf71d20c631b8 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 13:50:06 -0500 Subject: [PATCH 28/48] Facebook listing scraper incomplete --- scrapers/facebook.py | 39 ++++++++++++++++++++++++++------------- scrapers/scrapers.py | 3 ++- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/scrapers/facebook.py b/scrapers/facebook.py index 5b8c43f..b11f937 100644 --- a/scrapers/facebook.py +++ b/scrapers/facebook.py @@ -9,6 +9,10 @@ priceClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u" metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" +listingInfoClass = "x78zum5 xdt5ytf x1iyjqo2 x1n2onr6" +listingSectionClass = "xod5an3" +bodyClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" + def loadPageResources(driver): scroll = 100 @@ -72,13 +76,13 @@ def processAttributes(attributes): return processedAttributes def scrapeListing(url): - browser = setupBrowser() + browser = utils.setupBrowser() # Navigate to the URL - print(f"Going to {url}") - browser.get(url) + print(f"Going to {url[0:60]}") + browser.get(url[0:60]) - print(f"Loading page for {url}") + print(f"Loading page for {url[0:60]}") time.sleep(1) # Create a BeautifulSoup object from the HTML of the page @@ -86,16 +90,25 @@ def 
scrapeListing(url): soup = BeautifulSoup(html, 'html.parser') try: - description = soup.find('section', id='postingbody').text - attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - - map = soup.find('div', id='map') - longitude = map["data-longitude"] - latitude = map["data-latitude"] + seeMoreButton = browser.find_element("class name", "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace(" ", ".")) + utils.clickOn(seeMoreButton, browser) + + listingInfo = soup.find('div', class_=listingInfoClass) + # description = listingInfo.find('span', class_="x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u") + print(listingInfo) - print([attributes, description, longitude, latitude]) - except: - print(f"Failed scraping {url}") + return 2 + + # attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + # map = soup.find('div', id='map') + # longitude = map["data-longitude"] + # latitude = map["data-latitude"] + + # print([attributes, description, longitude, latitude]) + except Exception as error: + print(error) + return -1 # Close the Selenium WebDriver instance browser.quit() \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index a3795e3..e3e1e73 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -23,7 +23,8 @@ def link(link: str): newInfo = cl.scrapeListing(link) db.update(link, newInfo) elif("https://www.facebook.com/marketplace" in link): - fb.scrapeListing(link) + newInfo = fb.scrapeListing(link) + print(newInfo) else: print("Not a Craigslist nor a Facebook Marketplace link") From e91838c39fc3a3792eca388647c8461f46812550 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 9 Nov 2023 13:04:10 -0600 Subject: [PATCH 29/48] added pipfile and pipfile.lock to manage dependencies --- scrapers/Pipfile | 16 +++ scrapers/Pipfile.lock | 281 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 scrapers/Pipfile create mode 100644 scrapers/Pipfile.lock diff --git a/scrapers/Pipfile b/scrapers/Pipfile new file mode 100644 index 0000000..d054d18 --- /dev/null +++ b/scrapers/Pipfile @@ -0,0 +1,16 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +selenium = "*" +bs4 = "*" +pymongo = "*" +typer = "*" +python-dotenv = "*" + +[dev-packages] + +[requires] +python_version = "3.11" diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock new file mode 100644 index 0000000..00b7d4f --- /dev/null +++ b/scrapers/Pipfile.lock @@ -0,0 +1,281 @@ +{ + "_meta": { + "hash": { + "sha256": "11dbba9e7645169d8dd9e6cfe9118716f9e853beec34798dce95b25c651c9695" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.11" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "attrs": { + "hashes": [ + "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04", + "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015" + ], + "markers": "python_version >= '3.7'", + "version": "==23.1.0" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da", + "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a" + ], + "markers": 
"python_full_version >= '3.6.0'", + "version": "==4.12.2" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" + }, + "certifi": { + "hashes": [ + "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", + "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" + ], + "markers": "python_version >= '3.6'", + "version": "==2023.7.22" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "dnspython": { + "hashes": [ + "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8", + "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984" + ], + "markers": "python_version >= '3.8' and python_version < '4.0'", + "version": "==2.4.2" + }, + "h11": { + "hashes": [ + "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", + "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761" + ], + "markers": "python_version >= '3.7'", + "version": "==0.14.0" + }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, + "outcome": { + "hashes": [ + "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", + "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.0.post0" + }, + "pymongo": { + "hashes": [ + "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330", + "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651", + "sha256:05c30fd35cc97f14f354916b45feea535d59060ef867446b5c3c7f9b609dd5dc", + "sha256:0634994b026336195778e5693583c060418d4ab453eff21530422690a97e1ee8", + "sha256:09c7de516b08c57647176b9fc21d929d628e35bcebc7422220c89ae40b62126a", + "sha256:107a234dc55affc5802acb3b6d83cbb8c87355b38a9457fcd8806bdeb8bce161", + "sha256:10a379fb60f1b2406ae57b8899bacfe20567918c8e9d2d545e1b93628fcf2050", + "sha256:128b1485753106c54af481789cdfea12b90a228afca0b11fb3828309a907e10e", + "sha256:1394c4737b325166a65ae7c145af1ebdb9fb153ebedd37cf91d676313e4a67b8", + "sha256:1c63e3a2e8fb815c4b1f738c284a4579897e37c3cfd95fdb199229a1ccfb638a", + "sha256:1e4ed21029d80c4f62605ab16398fe1ce093fff4b5f22d114055e7d9fbc4adb0", + "sha256:1ec71ac633b126c0775ed4604ca8f56c3540f5c21a1220639f299e7a544b55f9", + "sha256:21812453354b151200034750cd30b0140e82ec2a01fd4357390f67714a1bfbde", + "sha256:256c503a75bd71cf7fb9ebf889e7e222d49c6036a48aad5a619f98a0adf0e0d7", + "sha256:2703a9f8f5767986b4f51c259ff452cc837c5a83c8ed5f5361f6e49933743b2f", + "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a", + "sha256:2972dd1f1285866aba027eff2f4a2bbf8aa98563c2ced14cb34ee5602b36afdf", + "sha256:2973f113e079fb98515722cd728e1820282721ec9fd52830e4b73cabdbf1eb28", + "sha256:2ca0ba501898b2ec31e6c3acf90c31910944f01d454ad8e489213a156ccf1bda", + "sha256:2d2be5c9c3488fa8a70f83ed925940f488eac2837a996708d98a0e54a861f212", + "sha256:2f8c04277d879146eacda920476e93d520eff8bec6c022ac108cfa6280d84348", + "sha256:325701ae7b56daa5b0692305b7cb505ca50f80a1288abb32ff420a8a209b01ca", + 
"sha256:3729b8db02063da50eeb3db88a27670d85953afb9a7f14c213ac9e3dca93034b", + "sha256:3919708594b86d0f5cdc713eb6fccd3f9b9532af09ea7a5d843c933825ef56c4", + "sha256:39a1cd5d383b37285641d5a7a86be85274466ae336a61b51117155936529f9b3", + "sha256:3ec6c20385c5a58e16b1ea60c5e4993ea060540671d7d12664f385f2fb32fe79", + "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4", + "sha256:49f2af6cf82509b15093ce3569229e0d53c90ad8ae2eef940652d4cf1f81e045", + "sha256:4a0269811661ba93c472c8a60ea82640e838c2eb148d252720a09b5123f2c2fe", + "sha256:518c90bdd6e842c446d01a766b9136fec5ec6cc94f3b8c3f8b4a332786ee6b64", + "sha256:5717a308a703dda2886a5796a07489c698b442f5e409cf7dc2ac93de8d61d764", + "sha256:5802acc012bbb4bce4dff92973dff76482f30ef35dd4cb8ab5b0e06aa8f08c80", + "sha256:5e63146dbdb1eac207464f6e0cfcdb640c9c5ff0f57b754fa96fe252314a1dc6", + "sha256:6695d7136a435c1305b261a9ddb9b3ecec9863e05aab3935b96038145fd3a977", + "sha256:680fa0fc719e1a3dcb81130858368f51d83667d431924d0bcf249644bce8f303", + "sha256:6b18276f14b4b6d92e707ab6db19b938e112bd2f1dc3f9f1a628df58e4fd3f0d", + "sha256:6bafea6061d63059d8bc2ffc545e2f049221c8a4457d236c5cd6a66678673eab", + "sha256:6d6a1b1361f118e7fefa17ae3114e77f10ee1b228b20d50c47c9f351346180c8", + "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34", + "sha256:79f41576b3022c2fe9780ae3e44202b2438128a25284a8ddfa038f0785d87019", + "sha256:7b0e6361754ac596cd16bfc6ed49f69ffcd9b60b7bc4bcd3ea65c6a83475e4ff", + "sha256:7e3b0127b260d4abae7b62203c4c7ef0874c901b55155692353db19de4b18bc4", + "sha256:7fc2bb8a74dcfcdd32f89528e38dcbf70a3a6594963d60dc9595e3b35b66e414", + "sha256:806e094e9e85d8badc978af8c95b69c556077f11844655cb8cd2d1758769e521", + "sha256:81dd1308bd5630d2bb5980f00aa163b986b133f1e9ed66c66ce2a5bc3572e891", + "sha256:82e620842e12e8cb4050d2643a81c8149361cd82c0a920fa5a15dc4ca8a4000f", + "sha256:85f2cdc400ee87f5952ebf2a117488f2525a3fb2e23863a8efe3e4ee9e54e4d1", + "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8", + "sha256:8adf014f2779992eba3b513e060d06f075f0ab2fb3ad956f413a102312f65cdf", + "sha256:9b0f98481ad5dc4cb430a60bbb8869f05505283b9ae1c62bdb65eb5e020ee8e3", + "sha256:9bea9138b0fc6e2218147e9c6ce1ff76ff8e29dc00bb1b64842bd1ca107aee9f", + "sha256:a09bfb51953930e7e838972ddf646c5d5f984992a66d79da6ba7f6a8d8a890cd", + "sha256:a0be99b599da95b7a90a918dd927b20c434bea5e1c9b3efc6a3c6cd67c23f813", + "sha256:a49aca4d961823b2846b739380c847e8964ff7ae0f0a683992b9d926054f0d6d", + "sha256:a4dc1319d0c162919ee7f4ee6face076becae2abbd351cc14f1fe70af5fb20d9", + "sha256:a8273e1abbcff1d7d29cbbb1ea7e57d38be72f1af3c597c854168508b91516c2", + "sha256:a8f7f9feecae53fa18d6a3ea7c75f9e9a1d4d20e5c3f9ce3fba83f07bcc4eee2", + "sha256:ad4f66fbb893b55f96f03020e67dcab49ffde0177c6565ccf9dec4fdf974eb61", + "sha256:af425f323fce1b07755edd783581e7283557296946212f5b1a934441718e7528", + "sha256:b14dd73f595199f4275bed4fb509277470d9b9059310537e3b3daba12b30c157", + "sha256:b4ad70d7cac4ca0c7b31444a0148bd3af01a2662fa12b1ad6f57cd4a04e21766", + "sha256:b80a4ee19b3442c57c38afa978adca546521a8822d663310b63ae2a7d7b13f3a", + "sha256:ba51129fcc510824b6ca6e2ce1c27e3e4d048b6e35d3ae6f7e517bed1b8b25ce", + "sha256:c011bd5ad03cc096f99ffcfdd18a1817354132c1331bed7a837a25226659845f", + "sha256:cc94f9fea17a5af8cf1a343597711a26b0117c0b812550d99934acb89d526ed2", + "sha256:ccd785fafa1c931deff6a7116e9a0d402d59fabe51644b0d0c268295ff847b25", + "sha256:d16a534da0e39785687b7295e2fcf9a339f4a20689024983d11afaa4657f8507", + "sha256:d3077a31633beef77d057c6523f5de7271ddef7bde5e019285b00c0cc9cac1e3", + 
"sha256:d603edea1ff7408638b2504905c032193b7dcee7af269802dbb35bc8c3310ed5", + "sha256:db082f728160369d9a6ed2e722438291558fc15ce06d0a7d696a8dad735c236b", + "sha256:ddef295aaf80cefb0c1606f1995899efcb17edc6b327eb6589e234e614b87756", + "sha256:e16ade71c93f6814d095d25cd6d28a90d63511ea396bd96e9ffcb886b278baaa", + "sha256:e3db7d833a7c38c317dc95b54e27f1d27012e031b45a7c24e360b53197d5f6e7", + "sha256:e5e193f89f4f8c1fe273f9a6e6df915092c9f2af6db2d1afb8bd53855025c11f", + "sha256:eb438a8bf6b695bf50d57e6a059ff09652a07968b2041178b3744ea785fcef9b", + "sha256:ebf02c32afa6b67e5861a27183dd98ed88419a94a2ab843cc145fb0bafcc5b28", + "sha256:ecd9e1fa97aa11bf67472220285775fa15e896da108f425e55d23d7540a712ce", + "sha256:ef67fedd863ffffd4adfd46d9d992b0f929c7f61a8307366d664d93517f2c78e", + "sha256:f28ae33dc5a0b9cee06e95fd420e42155d83271ab75964baf747ce959cac5f52", + "sha256:fb1c56d891f9e34303c451998ef62ba52659648bb0d75b03c5e4ac223a3342c2", + "sha256:fe03bf25fae4b95d8afe40004a321df644400fdcba4c8e5e1a19c1085b740888" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==4.6.0" + }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "version": "==1.7.1" + }, + "python-dotenv": { + "hashes": [ + "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba", + "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==1.0.0" + }, + "selenium": { + "hashes": [ + "sha256:22eab5a1724c73d51b240a69ca702997b717eee4ba1f6065bf5d6b44dba01d48", + "sha256:9e82cd1ac647fb73cf0d4a6e280284102aaa3c9d94f0fa6e6cc4b5db6a30afbf" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==4.15.2" + }, + "sniffio": { + "hashes": [ + "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101", + "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.0" + }, + "sortedcontainers": { + "hashes": [ + "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", + "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0" + ], + "version": "==2.4.0" + }, + "soupsieve": { + "hashes": [ + "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690", + "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7" + ], + "markers": "python_version >= '3.8'", + "version": "==2.5" + }, + "trio": { + "hashes": [ + "sha256:16f89f7dcc8f7b9dcdec1fcd863e0c039af6d0f9a22f8dfd56f75d75ec73fd48", + "sha256:bb4abb3f4af23f96679e7c8cdabb8b234520f2498550d2cf63ebfd95f2ce27fe" + ], + "markers": "python_version >= '3.8'", + "version": "==0.23.1" + }, + "trio-websocket": { + "hashes": [ + "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f", + "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638" + ], + "markers": "python_version >= '3.7'", + "version": "==0.11.1" + }, + "typer": { + "hashes": [ + "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2", + "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==0.9.0" + }, + "typing-extensions": { + "hashes": [ + 
"sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", + "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" + ], + "markers": "python_version >= '3.8'", + "version": "==4.8.0" + }, + "urllib3": { + "extras": [ + "socks" + ], + "hashes": [ + "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84", + "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.7" + }, + "wsproto": { + "hashes": [ + "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065", + "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==1.2.0" + } + }, + "develop": {} +} From 0e1cd28cec2f4540dab707495507897fb9c068ff Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 9 Nov 2023 13:14:34 -0600 Subject: [PATCH 30/48] organize file structure --- scrapers/scrapers.py | 8 ++++---- scrapers/src/__init__.py | 0 scrapers/{ => src}/craigslist.py | 2 +- scrapers/{ => src}/database.py | 0 scrapers/{ => src}/facebook.py | 2 +- scrapers/{ => src}/utils.py | 9 ++++----- 6 files changed, 10 insertions(+), 11 deletions(-) create mode 100644 scrapers/src/__init__.py rename scrapers/{ => src}/craigslist.py (95%) rename scrapers/{ => src}/database.py (100%) rename scrapers/{ => src}/facebook.py (96%) rename scrapers/{ => src}/utils.py (92%) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index e3e1e73..78a8b6b 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,8 +1,8 @@ import typer -import utils -import craigslist as cl -import facebook as fb -import database as db +from src import utils +from src import craigslist as cl +from src import facebook as fb +from src import database as db app = typer.Typer() diff --git a/scrapers/src/__init__.py b/scrapers/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapers/craigslist.py b/scrapers/src/craigslist.py similarity index 95% rename from scrapers/craigslist.py rename to scrapers/src/craigslist.py index 8dc92cc..d08322c 100644 --- a/scrapers/craigslist.py +++ b/scrapers/src/craigslist.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup import time -import utils +from . import utils def loadPageResources(driver): scroll = 100 diff --git a/scrapers/database.py b/scrapers/src/database.py similarity index 100% rename from scrapers/database.py rename to scrapers/src/database.py diff --git a/scrapers/facebook.py b/scrapers/src/facebook.py similarity index 96% rename from scrapers/facebook.py rename to scrapers/src/facebook.py index b11f937..291c7ec 100644 --- a/scrapers/facebook.py +++ b/scrapers/src/facebook.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup import time -import utils +from . import utils postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" linkClass = "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv" diff --git a/scrapers/utils.py b/scrapers/src/utils.py similarity index 92% rename from scrapers/utils.py rename to scrapers/src/utils.py index add41cf..6e1172a 100644 --- a/scrapers/utils.py +++ b/scrapers/src/utils.py @@ -1,10 +1,9 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options -import utils -import database as db +from . 
import database as db
-
-import craigslist
-import facebook
+from . import craigslist
+from . import facebook
 
 def scrollTo(x, driver):
     driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})")
@@ -28,7 +27,7 @@ def scrape(website, scraperVersion):
         scraper = facebook
 
     cityURLs = scraper.setupURLs(2011)
-    browser = utils.setupBrowser()
+    browser = setupBrowser()
 
     for url in cityURLs:
         print(f"Going to {url}")
From c2694a4e44252949834a9f4c12d975f0d91d88fc Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 13:27:54 -0600
Subject: [PATCH 31/48] Update README with scraper instructions
---
 README.md | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 0c0ecc7..7adf5a1 100644
--- a/README.md
+++ b/README.md
@@ -5,4 +5,27 @@ Senior Design Repository for the Statefarm Automotive Fraud Project
 Make a copy of the ``.env.example`` file and make the following changes.
 1. remove ``.example`` from the extension
 2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
-3. Paste the connection URL provided in MongoDB Atlas. Include the password and username fields using ``${VARIABLE}`` syntax to embed the value of the variable \ No newline at end of file
+3. Paste the connection URL provided in MongoDB Atlas. Include the password and username fields using ``${VARIABLE}`` syntax to embed the value of the variable
+
+## Run Scrapers locally
+**Prerequisites**
+- python3
+- pipenv
+
+**Installing dependencies**
+Navigate to ``scrapers/`` and run
+```bash
+pipenv install
+```
+
+**Scraper Usage**
+```bash
+# Scrape Craigslist homepage
+python3 scrapers.py craigslist
+
+# Scrape Facebook Marketplace homepage
+python3 scrapers.py facebook
+
+# Scrape a specific craigslist or facebook car listing
+python3 scrapers.py link [LINK]
+``` \ No newline at end of file
From 816d18506c1f329c703a1371b3e43acf03caef5b Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 13:30:17 -0600
Subject: [PATCH 32/48] pipenv shell in README
---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 7adf5a1..1092af9 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,11 @@ Make a copy of the ``.env.example`` file and make the following changes. 
 - pipenv
 
 **Installing dependencies**
-Navigate to ``scrapers/`` and run
+Navigate to ``scrapers/`` and open the virtual environment using
+```bash
+pipenv shell
+```
+Then install dependencies using
 ```bash
 pipenv install
 ```
From 667183ddf84ecbe8954c8596193994be9e6586c4 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 14:06:23 -0600
Subject: [PATCH 33/48] added pipfile scripts
---
 scrapers/Pipfile | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/scrapers/Pipfile b/scrapers/Pipfile
index d054d18..e94f5d9 100644
--- a/scrapers/Pipfile
+++ b/scrapers/Pipfile
@@ -3,6 +3,11 @@ url = "https://pypi.org/simple"
 verify_ssl = true
 name = "pypi"
+[scripts]
+craigslist = "python3 scrapers.py craigslist"
+facebook = "python3 scrapers.py facebook"
+link = "python3 scrapers.py link"
+
 [packages]
 selenium = "*"
From f14de608917136a6de791a56b61c6cecc64d18b9 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 14:25:58 -0600
Subject: [PATCH 34/48] update README with pipenv scripts
---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 1092af9..54a27fd 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,11 @@ pipenv install
 **Scraper Usage**
 ```bash
 # Scrape Craigslist homepage
-python3 scrapers.py craigslist
+pipenv run craigslist
 # Scrape Facebook Marketplace homepage
-python3 scrapers.py facebook
+pipenv run facebook
 # Scrape a specific craigslist or facebook car listing
-python3 scrapers.py link [LINK]
+pipenv run link [LINK]
 ``` \ No newline at end of file
From dcd09598e21ccef6163ef9009ae31f05b260068c Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 20:30:53 -0600
Subject: [PATCH 35/48] updated db and collection name
---
 scrapers/src/database.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
index 40285e2..5c47e9e 100644
--- a/scrapers/src/database.py
+++ b/scrapers/src/database.py
@@ -2,6 +2,8 @@ import pymongo
 import os
 from datetime import date
+db = "scrape"
+collection = "scraped_raw"
 def get_conn(db):
     # load environment variable containing db uri (which includes username and password)
@@ -17,7 +19,6 @@ def get_conn(db):
         print("An Invalid URI host error was received. 
Is your Atlas host name correct in your connection string (found the .env)?") return {"success" : False, "db": 0} - # use a database named "Test" return {"success" : True, "db": client.get_database(db)} def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): @@ -49,19 +50,19 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images for attr in attributes: car[attr["label"]] = attr["value"] - # Insert into collection called "scrape_test" - conn = get_conn("Test") + # Insert into collection called "scrape_raw" + conn = get_conn(db) if (conn["success"]): - result = conn["db"]["raw"].insert_one(car) + result = conn["db"][collection].insert_one(car) return result.acknowledged else: return False def update(link, newFields): - conn = get_conn("Test") + conn = get_conn(db) if (conn["success"]): - result = conn["db"]["raw"].update_one( + result = conn["db"][collection].update_one( {'_id': link}, { '$set': newFields @@ -70,4 +71,3 @@ def update(link, newFields): return result.acknowledged else: return False - From 3e202b04a5aeb044366d0b074c51a73bb4da9b68 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Sun, 12 Nov 2023 18:38:36 -0600 Subject: [PATCH 36/48] finally got docker running locally T-T --- scrapers/Dockerfile | 22 ++++++++++++++++++++++ scrapers/requirements.txt | 23 +++++++++++++++++++++++ scrapers/scrapers.py | 4 ++-- scrapers/src/utils.py | 22 +++++++++++++++++++--- 4 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 scrapers/Dockerfile create mode 100644 scrapers/requirements.txt diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile new file mode 100644 index 0000000..6a2f76a --- /dev/null +++ b/scrapers/Dockerfile @@ -0,0 +1,22 @@ +FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build +RUN yum install -y unzip && \ + curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \ + curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \ + unzip /tmp/chromedriver-linux64.zip -d /opt/ && \ + unzip /tmp/chrome-linux64.zip -d /opt/ + +FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 +RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \ + libXcursor libXdamage libXext libXi libXrandr libXScrnSaver \ + libXtst pango at-spi2-atk libXt xorg-x11-server-Xvfb \ + xorg-x11-xauth dbus-glib dbus-glib-devel -y +COPY --from=build /opt/chrome-linux64 /opt/chrome +COPY --from=build /opt/chromedriver-linux64 /opt/ + +COPY scrapers.py ./ +COPY src ./src +COPY requirements.txt ./ + +RUN pip install -r requirements.txt + +CMD [ "scrapers.craigslist" ] \ No newline at end of file diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt new file mode 100644 index 0000000..defce39 --- /dev/null +++ b/scrapers/requirements.txt @@ -0,0 +1,23 @@ +-i https://pypi.org/simple +attrs==23.1.0; python_version >= '3.7' +beautifulsoup4==4.12.2; python_full_version >= '3.6.0' +bs4==0.0.1 +certifi==2023.7.22; python_version >= '3.6' +click==8.1.7; python_version >= '3.7' +dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0' +h11==0.14.0; python_version >= '3.7' +idna==3.4; python_version >= '3.5' +outcome==1.3.0.post0; python_version >= '3.7' 
+pymongo==4.6.0; python_version >= '3.7' +pysocks==1.7.1 +python-dotenv==1.0.0; python_version >= '3.8' +selenium==4.15.2; python_version >= '3.8' +sniffio==1.3.0; python_version >= '3.7' +sortedcontainers==2.4.0 +soupsieve==2.5; python_version >= '3.8' +trio==0.23.1; python_version >= '3.8' +trio-websocket==0.11.1; python_version >= '3.7' +typer==0.9.0; python_version >= '3.6' +typing-extensions==4.8.0; python_version >= '3.8' +urllib3[socks]==2.0.7; python_version >= '3.7' +wsproto==1.2.0; python_full_version >= '3.7.0' diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 78a8b6b..284caeb 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -10,11 +10,11 @@ facebookScraperVersion = 1 @app.command() -def craigslist(): +def craigslist(event, context): utils.scrape("craigslist", craigslistScraperVersion) @app.command() -def facebook(): +def facebook(event, context): utils.scrape("facebook", facebookScraperVersion) @app.command() diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py index 6e1172a..6d51bbe 100644 --- a/scrapers/src/utils.py +++ b/scrapers/src/utils.py @@ -11,14 +11,30 @@ def scrollTo(x, driver): def clickOn(elem, driver): driver.execute_script("arguments[0].click();", elem) +def createDriverOptions(): + options = webdriver.ChromeOptions() + options.binary_location = '/opt/chrome/chrome' + + options.add_argument("--headless=new") + options.add_argument("--headless=new") + options.add_argument('--no-sandbox') + options.add_argument("--disable-gpu") + options.add_argument("--window-size=1280x1696") + options.add_argument("--single-process") + options.add_argument("--disable-dev-shm-usage") + options.add_argument("--disable-dev-tools") + options.add_argument("--no-zygote") + + return options + def setupBrowser(): print("Setting up headless browser") - options = Options() - options.add_argument("--headless=new") + service = webdriver.ChromeService("/opt/chromedriver") + options = createDriverOptions() print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) + return webdriver.Chrome(options=options, service=service) def scrape(website, scraperVersion): if (website == 'craigslist'): From 21e751c16d06e2624bec311b6b5efb540d4e9cab Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Sun, 12 Nov 2023 19:35:45 -0600 Subject: [PATCH 37/48] use environ.get instead of getenv --- .env.example => scrapers/.env.example | 0 scrapers/src/database.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename .env.example => scrapers/.env.example (100%) diff --git a/.env.example b/scrapers/.env.example similarity index 100% rename from .env.example rename to scrapers/.env.example diff --git a/scrapers/src/database.py b/scrapers/src/database.py index 5c47e9e..834c14c 100644 --- a/scrapers/src/database.py +++ b/scrapers/src/database.py @@ -8,7 +8,7 @@ def get_conn(db): # load environment variable containing db uri (which includes username and password) load_dotenv() - db_uri = os.getenv("DB_URI") + db_uri = os.environ.get("DB_URI") # create a mongodb connection try: From 5a5515f5ac45d804d82cda093dc5b05778055e66 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 17 Nov 2023 10:21:32 -0600 Subject: [PATCH 38/48] updated pipenv scripts to work for docker locally --- scrapers/Pipfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scrapers/Pipfile b/scrapers/Pipfile index e94f5d9..4eeccc8 100644 --- a/scrapers/Pipfile +++ b/scrapers/Pipfile @@ -4,9 +4,11 @@ verify_ssl = true name = "pypi" [scripts] -craigslist 
= "python3 scrapers.py craigslist" -facebook = "python3 scrapers.py facebook" -link = "python3 scrapers.py link" +build = "docker build --platform linux/amd64 -t smare ." +cont = "docker run --name smarecontainer -d smare:latest" +exec = "docker exec -it smarecontainer" +craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'" +facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'" [packages] selenium = "*" From 5e86f1c770707076b3e57dd83860958d4a8af4f5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 17 Nov 2023 10:24:35 -0600 Subject: [PATCH 39/48] update README with new pipenv commands --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 54a27fd..f2928fc 100644 --- a/README.md +++ b/README.md @@ -23,13 +23,19 @@ pipenv install ``` **Scraper Usage** +To create build a Docker Image use +```bash +pipenv run build +``` +to run a docker container "smarecontainer" use +```bash +pipenv run cont +``` +then ```bash # Scrape Craigsist homepage pipenv run craigslist # Scrape Facebook Marketplace homepage pipenv run facebook - -# Scrape a specific carigslist or facebook car listing -pipenv run link [LINK] ``` \ No newline at end of file From 81f4f40142a8baa37ba216563dea9131cbe8c330 Mon Sep 17 00:00:00 2001 From: Waseem Polus <69316929+waseem-polus@users.noreply.github.com> Date: Fri, 17 Nov 2023 10:45:28 -0600 Subject: [PATCH 40/48] Add missing new lines --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f2928fc..1a0d75c 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Make a copy of the ``.env.example`` file and make the following changes. - python3 - pipenv -**Installing dependencies** +**Installing dependencies** Navigate to ``scrapers/`` and open the virtual environment using ```bash pipenv shell @@ -22,7 +22,7 @@ Then install dependencies using pipenv install ``` -**Scraper Usage** +**Scraper Usage** To create build a Docker Image use ```bash pipenv run build @@ -38,4 +38,4 @@ pipenv run craigslist # Scrape Facebook Marketplace homepage pipenv run facebook -``` \ No newline at end of file +``` From fd2cf6f33a792c264a67bf4816dc3c0fe0402d7c Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 30 Nov 2023 00:21:06 -0600 Subject: [PATCH 41/48] use regex to determine if facebook or craigslist link --- scrapers/Pipfile.lock | 184 +++++++++++++++++++------------------- scrapers/requirements.txt | 8 +- scrapers/scrapers.py | 8 +- 3 files changed, 102 insertions(+), 98 deletions(-) diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock index 00b7d4f..689e92b 100644 --- a/scrapers/Pipfile.lock +++ b/scrapers/Pipfile.lock @@ -41,11 +41,11 @@ }, "certifi": { "hashes": [ - "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", - "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" + "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", + "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474" ], "markers": "python_version >= '3.6'", - "version": "==2023.7.22" + "version": "==2023.11.17" }, "click": { "hashes": [ @@ -73,11 +73,11 @@ }, "idna": { "hashes": [ - "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", - "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca", + 
"sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f" ], "markers": "python_version >= '3.5'", - "version": "==3.4" + "version": "==3.6" }, "outcome": { "hashes": [ @@ -89,91 +89,91 @@ }, "pymongo": { "hashes": [ - "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330", - "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651", - "sha256:05c30fd35cc97f14f354916b45feea535d59060ef867446b5c3c7f9b609dd5dc", - "sha256:0634994b026336195778e5693583c060418d4ab453eff21530422690a97e1ee8", - "sha256:09c7de516b08c57647176b9fc21d929d628e35bcebc7422220c89ae40b62126a", - "sha256:107a234dc55affc5802acb3b6d83cbb8c87355b38a9457fcd8806bdeb8bce161", - "sha256:10a379fb60f1b2406ae57b8899bacfe20567918c8e9d2d545e1b93628fcf2050", - "sha256:128b1485753106c54af481789cdfea12b90a228afca0b11fb3828309a907e10e", - "sha256:1394c4737b325166a65ae7c145af1ebdb9fb153ebedd37cf91d676313e4a67b8", - "sha256:1c63e3a2e8fb815c4b1f738c284a4579897e37c3cfd95fdb199229a1ccfb638a", - "sha256:1e4ed21029d80c4f62605ab16398fe1ce093fff4b5f22d114055e7d9fbc4adb0", - "sha256:1ec71ac633b126c0775ed4604ca8f56c3540f5c21a1220639f299e7a544b55f9", - "sha256:21812453354b151200034750cd30b0140e82ec2a01fd4357390f67714a1bfbde", - "sha256:256c503a75bd71cf7fb9ebf889e7e222d49c6036a48aad5a619f98a0adf0e0d7", - "sha256:2703a9f8f5767986b4f51c259ff452cc837c5a83c8ed5f5361f6e49933743b2f", - "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a", - "sha256:2972dd1f1285866aba027eff2f4a2bbf8aa98563c2ced14cb34ee5602b36afdf", - "sha256:2973f113e079fb98515722cd728e1820282721ec9fd52830e4b73cabdbf1eb28", - "sha256:2ca0ba501898b2ec31e6c3acf90c31910944f01d454ad8e489213a156ccf1bda", - "sha256:2d2be5c9c3488fa8a70f83ed925940f488eac2837a996708d98a0e54a861f212", - "sha256:2f8c04277d879146eacda920476e93d520eff8bec6c022ac108cfa6280d84348", - "sha256:325701ae7b56daa5b0692305b7cb505ca50f80a1288abb32ff420a8a209b01ca", - "sha256:3729b8db02063da50eeb3db88a27670d85953afb9a7f14c213ac9e3dca93034b", - "sha256:3919708594b86d0f5cdc713eb6fccd3f9b9532af09ea7a5d843c933825ef56c4", - "sha256:39a1cd5d383b37285641d5a7a86be85274466ae336a61b51117155936529f9b3", - "sha256:3ec6c20385c5a58e16b1ea60c5e4993ea060540671d7d12664f385f2fb32fe79", - "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4", - "sha256:49f2af6cf82509b15093ce3569229e0d53c90ad8ae2eef940652d4cf1f81e045", - "sha256:4a0269811661ba93c472c8a60ea82640e838c2eb148d252720a09b5123f2c2fe", - "sha256:518c90bdd6e842c446d01a766b9136fec5ec6cc94f3b8c3f8b4a332786ee6b64", - "sha256:5717a308a703dda2886a5796a07489c698b442f5e409cf7dc2ac93de8d61d764", - "sha256:5802acc012bbb4bce4dff92973dff76482f30ef35dd4cb8ab5b0e06aa8f08c80", - "sha256:5e63146dbdb1eac207464f6e0cfcdb640c9c5ff0f57b754fa96fe252314a1dc6", - "sha256:6695d7136a435c1305b261a9ddb9b3ecec9863e05aab3935b96038145fd3a977", - "sha256:680fa0fc719e1a3dcb81130858368f51d83667d431924d0bcf249644bce8f303", - "sha256:6b18276f14b4b6d92e707ab6db19b938e112bd2f1dc3f9f1a628df58e4fd3f0d", - "sha256:6bafea6061d63059d8bc2ffc545e2f049221c8a4457d236c5cd6a66678673eab", - "sha256:6d6a1b1361f118e7fefa17ae3114e77f10ee1b228b20d50c47c9f351346180c8", - "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34", - "sha256:79f41576b3022c2fe9780ae3e44202b2438128a25284a8ddfa038f0785d87019", - "sha256:7b0e6361754ac596cd16bfc6ed49f69ffcd9b60b7bc4bcd3ea65c6a83475e4ff", - "sha256:7e3b0127b260d4abae7b62203c4c7ef0874c901b55155692353db19de4b18bc4", - "sha256:7fc2bb8a74dcfcdd32f89528e38dcbf70a3a6594963d60dc9595e3b35b66e414", - 
"sha256:806e094e9e85d8badc978af8c95b69c556077f11844655cb8cd2d1758769e521", - "sha256:81dd1308bd5630d2bb5980f00aa163b986b133f1e9ed66c66ce2a5bc3572e891", - "sha256:82e620842e12e8cb4050d2643a81c8149361cd82c0a920fa5a15dc4ca8a4000f", - "sha256:85f2cdc400ee87f5952ebf2a117488f2525a3fb2e23863a8efe3e4ee9e54e4d1", - "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8", - "sha256:8adf014f2779992eba3b513e060d06f075f0ab2fb3ad956f413a102312f65cdf", - "sha256:9b0f98481ad5dc4cb430a60bbb8869f05505283b9ae1c62bdb65eb5e020ee8e3", - "sha256:9bea9138b0fc6e2218147e9c6ce1ff76ff8e29dc00bb1b64842bd1ca107aee9f", - "sha256:a09bfb51953930e7e838972ddf646c5d5f984992a66d79da6ba7f6a8d8a890cd", - "sha256:a0be99b599da95b7a90a918dd927b20c434bea5e1c9b3efc6a3c6cd67c23f813", - "sha256:a49aca4d961823b2846b739380c847e8964ff7ae0f0a683992b9d926054f0d6d", - "sha256:a4dc1319d0c162919ee7f4ee6face076becae2abbd351cc14f1fe70af5fb20d9", - "sha256:a8273e1abbcff1d7d29cbbb1ea7e57d38be72f1af3c597c854168508b91516c2", - "sha256:a8f7f9feecae53fa18d6a3ea7c75f9e9a1d4d20e5c3f9ce3fba83f07bcc4eee2", - "sha256:ad4f66fbb893b55f96f03020e67dcab49ffde0177c6565ccf9dec4fdf974eb61", - "sha256:af425f323fce1b07755edd783581e7283557296946212f5b1a934441718e7528", - "sha256:b14dd73f595199f4275bed4fb509277470d9b9059310537e3b3daba12b30c157", - "sha256:b4ad70d7cac4ca0c7b31444a0148bd3af01a2662fa12b1ad6f57cd4a04e21766", - "sha256:b80a4ee19b3442c57c38afa978adca546521a8822d663310b63ae2a7d7b13f3a", - "sha256:ba51129fcc510824b6ca6e2ce1c27e3e4d048b6e35d3ae6f7e517bed1b8b25ce", - "sha256:c011bd5ad03cc096f99ffcfdd18a1817354132c1331bed7a837a25226659845f", - "sha256:cc94f9fea17a5af8cf1a343597711a26b0117c0b812550d99934acb89d526ed2", - "sha256:ccd785fafa1c931deff6a7116e9a0d402d59fabe51644b0d0c268295ff847b25", - "sha256:d16a534da0e39785687b7295e2fcf9a339f4a20689024983d11afaa4657f8507", - "sha256:d3077a31633beef77d057c6523f5de7271ddef7bde5e019285b00c0cc9cac1e3", - "sha256:d603edea1ff7408638b2504905c032193b7dcee7af269802dbb35bc8c3310ed5", - "sha256:db082f728160369d9a6ed2e722438291558fc15ce06d0a7d696a8dad735c236b", - "sha256:ddef295aaf80cefb0c1606f1995899efcb17edc6b327eb6589e234e614b87756", - "sha256:e16ade71c93f6814d095d25cd6d28a90d63511ea396bd96e9ffcb886b278baaa", - "sha256:e3db7d833a7c38c317dc95b54e27f1d27012e031b45a7c24e360b53197d5f6e7", - "sha256:e5e193f89f4f8c1fe273f9a6e6df915092c9f2af6db2d1afb8bd53855025c11f", - "sha256:eb438a8bf6b695bf50d57e6a059ff09652a07968b2041178b3744ea785fcef9b", - "sha256:ebf02c32afa6b67e5861a27183dd98ed88419a94a2ab843cc145fb0bafcc5b28", - "sha256:ecd9e1fa97aa11bf67472220285775fa15e896da108f425e55d23d7540a712ce", - "sha256:ef67fedd863ffffd4adfd46d9d992b0f929c7f61a8307366d664d93517f2c78e", - "sha256:f28ae33dc5a0b9cee06e95fd420e42155d83271ab75964baf747ce959cac5f52", - "sha256:fb1c56d891f9e34303c451998ef62ba52659648bb0d75b03c5e4ac223a3342c2", - "sha256:fe03bf25fae4b95d8afe40004a321df644400fdcba4c8e5e1a19c1085b740888" + "sha256:00c199e1c593e2c8b033136d7a08f0c376452bac8a896c923fcd6f419e07bdd2", + "sha256:010bc9aa90fd06e5cc52c8fac2c2fd4ef1b5f990d9638548dde178005770a5e8", + "sha256:026a24a36394dc8930cbcb1d19d5eb35205ef3c838a7e619e04bd170713972e7", + "sha256:061598cbc6abe2f382ab64c9caa83faa2f4c51256f732cdd890bcc6e63bfb67e", + "sha256:13552ca505366df74e3e2f0a4f27c363928f3dff0eef9f281eb81af7f29bc3c5", + "sha256:13d613c866f9f07d51180f9a7da54ef491d130f169e999c27e7633abe8619ec9", + "sha256:144a31391a39a390efce0c5ebcaf4bf112114af4384c90163f402cec5ede476b", + "sha256:1461199b07903fc1424709efafe379205bf5f738144b1a50a08b0396357b5abf", + 
"sha256:154b361dcb358ad377d5d40df41ee35f1cc14c8691b50511547c12404f89b5cb", + "sha256:1c5654bb8bb2bdb10e7a0bc3c193dd8b49a960b9eebc4381ff5a2043f4c3c441", + "sha256:1de3c6faf948f3edd4e738abdb4b76572b4f4fdfc1fed4dad02427e70c5a6219", + "sha256:1ed23b0e2dac6f84f44c8494fbceefe6eb5c35db5c1099f56ab78fc0d94ab3af", + "sha256:1f2b856518bfcfa316c8dae3d7b412aecacf2e8ba30b149f5eb3b63128d703b9", + "sha256:2346450a075625c4d6166b40a013b605a38b6b6168ce2232b192a37fb200d588", + "sha256:262356ea5fcb13d35fb2ab6009d3927bafb9504ef02339338634fffd8a9f1ae4", + "sha256:27b81ecf18031998ad7db53b960d1347f8f29e8b7cb5ea7b4394726468e4295e", + "sha256:2940aa20e9cc328e8ddeacea8b9a6f5ddafe0b087fedad928912e787c65b4909", + "sha256:2d4ccac3053b84a09251da8f5350bb684cbbf8c8c01eda6b5418417d0a8ab198", + "sha256:2dd2f6960ee3c9360bed7fb3c678be0ca2d00f877068556785ec2eb6b73d2414", + "sha256:3071ec998cc3d7b4944377e5f1217c2c44b811fae16f9a495c7a1ce9b42fb038", + "sha256:3094c7d2f820eecabadae76bfec02669567bbdd1730eabce10a5764778564f7b", + "sha256:30b2c9caf3e55c2e323565d1f3b7e7881ab87db16997dc0cbca7c52885ed2347", + "sha256:3177f783ae7e08aaf7b2802e0df4e4b13903520e8380915e6337cdc7a6ff01d8", + "sha256:31dab1f3e1d0cdd57e8df01b645f52d43cc1b653ed3afd535d2891f4fc4f9712", + "sha256:33bb16a07d3cc4e0aea37b242097cd5f7a156312012455c2fa8ca396953b11c4", + "sha256:349093675a2d3759e4fb42b596afffa2b2518c890492563d7905fac503b20daa", + "sha256:39d77d8bbb392fa443831e6d4ae534237b1f4eee6aa186f0cdb4e334ba89536e", + "sha256:3a7f02a58a0c2912734105e05dedbee4f7507e6f1bd132ebad520be0b11d46fd", + "sha256:3b287e814a01deddb59b88549c1e0c87cefacd798d4afc0c8bd6042d1c3d48aa", + "sha256:3c74f4725485f0a7a3862cfd374cc1b740cebe4c133e0c1425984bcdcce0f4bb", + "sha256:3cadf7f4c8e94d8a77874b54a63c80af01f4d48c4b669c8b6867f86a07ba994f", + "sha256:3d18a9b9b858ee140c15c5bfcb3e66e47e2a70a03272c2e72adda2482f76a6ad", + "sha256:3f0e6a6c807fa887a0c51cc24fe7ea51bb9e496fe88f00d7930063372c3664c3", + "sha256:4344c30025210b9fa80ec257b0e0aab5aa1d5cca91daa70d82ab97b482cc038e", + "sha256:4497d49d785482cc1a44a0ddf8830b036a468c088e72a05217f5b60a9e025012", + "sha256:547dc5d7f834b1deefda51aedb11a7af9c51c45e689e44e14aa85d44147c7657", + "sha256:5556e306713e2522e460287615d26c0af0fe5ed9d4f431dad35c6624c5d277e9", + "sha256:55dac73316e7e8c2616ba2e6f62b750918e9e0ae0b2053699d66ca27a7790105", + "sha256:56816e43c92c2fa8c11dc2a686f0ca248bea7902f4a067fa6cbc77853b0f041e", + "sha256:5bd94c503271e79917b27c6e77f7c5474da6930b3fb9e70a12e68c2dff386b9a", + "sha256:5ec31adc2e988fd7db3ab509954791bbc5a452a03c85e45b804b4bfc31fa221d", + "sha256:69247f7a2835fc0984bbf0892e6022e9a36aec70e187fcfe6cae6a373eb8c4de", + "sha256:6a0ae7a48a6ef82ceb98a366948874834b86c84e288dbd55600c1abfc3ac1d88", + "sha256:6a1810c2cbde714decf40f811d1edc0dae45506eb37298fd9d4247b8801509fe", + "sha256:76013fef1c9cd1cd00d55efde516c154aa169f2bf059b197c263a255ba8a9ddf", + "sha256:77e0df59b1a4994ad30c6d746992ae887f9756a43fc25dec2db515d94cf0222d", + "sha256:7bb0e9049e81def6829d09558ad12d16d0454c26cabe6efc3658e544460688d9", + "sha256:88beb444fb438385e53dc9110852910ec2a22f0eab7dd489e827038fdc19ed8d", + "sha256:8b47ebd89e69fbf33d1c2df79759d7162fc80c7652dacfec136dae1c9b3afac7", + "sha256:8d219b4508f71d762368caec1fc180960569766049bbc4d38174f05e8ef2fe5b", + "sha256:8ec75f35f62571a43e31e7bd11749d974c1b5cd5ea4a8388725d579263c0fdf6", + "sha256:9167e735379ec43d8eafa3fd675bfbb12e2c0464f98960586e9447d2cf2c7a83", + "sha256:9a710c184ba845afb05a6f876edac8f27783ba70e52d5eaf939f121fc13b2f59", + "sha256:9aafd036f6f2e5ad109aec92f8dbfcbe76cff16bad683eb6dd18013739c0b3ae", + 
"sha256:9c79d597fb3a7c93d7c26924db7497eba06d58f88f58e586aa69b2ad89fee0f8", + "sha256:a2831e05ce0a4df10c4ac5399ef50b9a621f90894c2a4d2945dc5658765514ed", + "sha256:a5e641f931c5cd95b376fd3c59db52770e17bec2bf86ef16cc83b3906c054845", + "sha256:b10d8cda9fc2fcdcfa4a000aa10413a2bf8b575852cd07cb8a595ed09689ca98", + "sha256:b435b13bb8e36be11b75f7384a34eefe487fe87a6267172964628e2b14ecf0a7", + "sha256:b7b1a83ce514700276a46af3d9e481ec381f05b64939effc9065afe18456a6b9", + "sha256:b8729dbf25eb32ad0dc0b9bd5e6a0d0b7e5c2dc8ec06ad171088e1896b522a74", + "sha256:bbed8cccebe1169d45cedf00461b2842652d476d2897fd1c42cf41b635d88746", + "sha256:c258dbacfff1224f13576147df16ce3c02024a0d792fd0323ac01bed5d3c545d", + "sha256:c30a9e06041fbd7a7590693ec5e407aa8737ad91912a1e70176aff92e5c99d20", + "sha256:c91ea3915425bd4111cb1b74511cdc56d1d16a683a48bf2a5a96b6a6c0f297f7", + "sha256:d0355cff58a4ed6d5e5f6b9c3693f52de0784aa0c17119394e2a8e376ce489d4", + "sha256:d483793a384c550c2d12cb794ede294d303b42beff75f3b3081f57196660edaf", + "sha256:d4c2be9760b112b1caf649b4977b81b69893d75aa86caf4f0f398447be871f3c", + "sha256:d8e62d06e90f60ea2a3d463ae51401475568b995bafaffd81767d208d84d7bb1", + "sha256:da08ea09eefa6b960c2dd9a68ec47949235485c623621eb1d6c02b46765322ac", + "sha256:dd1fa413f8b9ba30140de198e4f408ffbba6396864c7554e0867aa7363eb58b2", + "sha256:e2aced6fb2f5261b47d267cb40060b73b6527e64afe54f6497844c9affed5fd0", + "sha256:e438417ce1dc5b758742e12661d800482200b042d03512a8f31f6aaa9137ad40", + "sha256:e470fa4bace5f50076c32f4b3cc182b31303b4fefb9b87f990144515d572820b", + "sha256:eaf2f65190c506def2581219572b9c70b8250615dc918b3b7c218361a51ec42e", + "sha256:ef102a67ede70e1721fe27f75073b5314911dbb9bc27cde0a1c402a11531e7bd", + "sha256:ef801027629c5b511cf2ba13b9be29bfee36ae834b2d95d9877818479cdc99ea", + "sha256:f7acc03a4f1154ba2643edeb13658d08598fe6e490c3dd96a241b94f09801626", + "sha256:f9756f1d25454ba6a3c2f1ef8b7ddec23e5cdeae3dc3c3377243ae37a383db00", + "sha256:ff62ba8ff70f01ab4fe0ae36b2cb0b5d1f42e73dfc81ddf0758cd9f77331ad25", + "sha256:ff925f1cca42e933376d09ddc254598f8c5fcd36efc5cac0118bb36c36217c41" ], "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==4.6.0" + "version": "==4.6.1" }, "pysocks": { "hashes": [ @@ -262,11 +262,11 @@ "socks" ], "hashes": [ - "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84", - "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e" + "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3", + "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54" ], - "markers": "python_version >= '3.7'", - "version": "==2.0.7" + "markers": "python_version >= '3.8'", + "version": "==2.1.0" }, "wsproto": { "hashes": [ diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt index defce39..e0430c4 100644 --- a/scrapers/requirements.txt +++ b/scrapers/requirements.txt @@ -2,13 +2,13 @@ attrs==23.1.0; python_version >= '3.7' beautifulsoup4==4.12.2; python_full_version >= '3.6.0' bs4==0.0.1 -certifi==2023.7.22; python_version >= '3.6' +certifi==2023.11.17; python_version >= '3.6' click==8.1.7; python_version >= '3.7' dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0' h11==0.14.0; python_version >= '3.7' -idna==3.4; python_version >= '3.5' +idna==3.6; python_version >= '3.5' outcome==1.3.0.post0; python_version >= '3.7' -pymongo==4.6.0; python_version >= '3.7' +pymongo==4.6.1; python_version >= '3.7' pysocks==1.7.1 python-dotenv==1.0.0; python_version >= '3.8' selenium==4.15.2; python_version >= '3.8' @@ -19,5 +19,5 
@@ trio==0.23.1; python_version >= '3.8'
 trio-websocket==0.11.1; python_version >= '3.7'
 typer==0.9.0; python_version >= '3.6'
 typing-extensions==4.8.0; python_version >= '3.8'
-urllib3[socks]==2.0.7; python_version >= '3.7'
+urllib3[socks]==2.1.0; python_version >= '3.8'
 wsproto==1.2.0; python_full_version >= '3.7.0'
diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index 284caeb..36f61e0 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -1,4 +1,5 @@
 import typer
+import re
 from src import utils
 from src import craigslist as cl
 from src import facebook as fb
@@ -19,10 +20,13 @@ def facebook(event, context):
 @app.command()
 def link(link: str):
-    if (".craigslist.org" in link):
+    clPattern = re.compile(r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$")
+    fbPattern = re.compile(r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$")
+
+    if (clPattern.match(link)):
         newInfo = cl.scrapeListing(link)
         db.update(link, newInfo)
-    elif("https://www.facebook.com/marketplace" in link):
+    elif(fbPattern.match(link)):
         newInfo = fb.scrapeListing(link)
         print(newInfo)
     else:

From cd2af8c1f04a6ff674cf808cf4af05324979654d Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 30 Nov 2023 00:41:28 -0600
Subject: [PATCH 42/48] added new lines to fix linting errors

---
 scrapers/scrapers.py | 2 +-
 scrapers/src/craigslist.py | 2 +-
 scrapers/src/database.py | 1 +
 scrapers/src/facebook.py | 2 +-
 scrapers/src/utils.py | 3 +--
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index 36f61e0..deb437e 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -33,4 +33,4 @@ def link(link: str):
         print("Not a Craigslist nor a Facebook Marketplace link")
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py
index d08322c..64ee544 100644
--- a/scrapers/src/craigslist.py
+++ b/scrapers/src/craigslist.py
@@ -101,4 +101,4 @@ def scrapeListing(url):
         print(f"Failed scraping {url}")
 
     # Close the Selenium WebDriver instance
-    browser.quit()
\ No newline at end of file
+    browser.quit()
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
index 834c14c..5cbd3a2 100644
--- a/scrapers/src/database.py
+++ b/scrapers/src/database.py
@@ -2,6 +2,7 @@
 import pymongo
 import os
 from datetime import date
+
 db = "scrape"
 collection = "scraped_raw"
diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py
index 291c7ec..c40694e 100644
--- a/scrapers/src/facebook.py
+++ b/scrapers/src/facebook.py
@@ -111,4 +111,4 @@ def scrapeListing(url):
         return -1
 
     # Close the Selenium WebDriver instance
-    browser.quit()
\ No newline at end of file
+    browser.quit()
diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py
index 6d51bbe..8651c74 100644
--- a/scrapers/src/utils.py
+++ b/scrapers/src/utils.py
@@ -1,7 +1,6 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from . import database as db
-
 from . import craigslist
 from . import facebook
@@ -65,4 +64,4 @@
         except Exception as error:
             print(error)
 
-    browser.quit()
\ No newline at end of file
+    browser.quit()

From 6ea8e12c05c8db68494f5c39eae3d3f8d41b09e5 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 30 Nov 2023 01:05:21 -0600
Subject: [PATCH 43/48] fixed import order and spacing for isort

---
 scrapers/scrapers.py | 8 +++++---
 scrapers/src/craigslist.py | 5 ++++-
 scrapers/src/database.py | 5 +++--
 scrapers/src/facebook.py | 4 +++-
 scrapers/src/utils.py | 4 +++-
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index deb437e..c4f4996 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -1,9 +1,11 @@
-import typer
 import re
-from src import utils
+
+import typer
+
 from src import craigslist as cl
-from src import facebook as fb
 from src import database as db
+from src import facebook as fb
+from src import utils
 
 app = typer.Typer()
diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py
index 64ee544..d4b3358 100644
--- a/scrapers/src/craigslist.py
+++ b/scrapers/src/craigslist.py
@@ -1,7 +1,10 @@
-from bs4 import BeautifulSoup
 import time
+
+from bs4 import BeautifulSoup
+
 from . import utils
 
+
 def loadPageResources(driver):
     scroll = 100
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
index 5cbd3a2..8143c2a 100644
--- a/scrapers/src/database.py
+++ b/scrapers/src/database.py
@@ -1,8 +1,9 @@
-from dotenv import load_dotenv
-import pymongo
 import os
 from datetime import date
 
+import pymongo
+from dotenv import load_dotenv
+
 db = "scrape"
 collection = "scraped_raw"
diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py
index c40694e..31752d8 100644
--- a/scrapers/src/facebook.py
+++ b/scrapers/src/facebook.py
@@ -1,5 +1,7 @@
-from bs4 import BeautifulSoup
 import time
+
+from bs4 import BeautifulSoup
+
 from . import utils
 
 postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24"
diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py
index 8651c74..17dbaef 100644
--- a/scrapers/src/utils.py
+++ b/scrapers/src/utils.py
@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
-from . import database as db
+
 from . import craigslist
+from . import database as db
 from .
import facebook + def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") From a8ab8b70fadcc5e0ce8fc591fc7c79bb4aeb982d Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 30 Nov 2023 01:39:47 -0600 Subject: [PATCH 44/48] fixed flake8 errors with black linter --- scrapers/.flake8 | 2 + scrapers/scrapers.py | 34 ++++--- scrapers/src/craigslist.py | 189 +++++++++++++++++++++-------------- scrapers/src/database.py | 140 ++++++++++++++------------ scrapers/src/facebook.py | 198 ++++++++++++++++++++++--------------- scrapers/src/utils.py | 114 ++++++++++++--------- 6 files changed, 399 insertions(+), 278 deletions(-) create mode 100644 scrapers/.flake8 diff --git a/scrapers/.flake8 b/scrapers/.flake8 new file mode 100644 index 0000000..79a16af --- /dev/null +++ b/scrapers/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index c4f4996..5d93a24 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -12,27 +12,35 @@ craigslistScraperVersion = 1 facebookScraperVersion = 1 + @app.command() def craigslist(event, context): - utils.scrape("craigslist", craigslistScraperVersion) + utils.scrape("craigslist", craigslistScraperVersion) + @app.command() def facebook(event, context): - utils.scrape("facebook", facebookScraperVersion) + utils.scrape("facebook", facebookScraperVersion) + @app.command() def link(link: str): - clPattern = re.compile(r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$") - fbPattern = re.compile(r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$") - - if (clPattern.match(link)): - newInfo = cl.scrapeListing(link) - db.update(link, newInfo) - elif(fbPattern.match(link)): - newInfo = fb.scrapeListing(link) - print(newInfo) - else: - print("Not a Craigslist nor a Facebook Marketplace link") + clPattern = re.compile( + r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$" + ) + fbPattern = re.compile( + r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$" + ) + + if clPattern.match(link): + newInfo = cl.scrapeListing(link) + db.update(link, newInfo) + elif fbPattern.match(link): + newInfo = fb.scrapeListing(link) + print(newInfo) + else: + print("Not a Craigslist nor a Facebook Marketplace link") + if __name__ == "__main__": app() diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py index d4b3358..13e30b9 100644 --- a/scrapers/src/craigslist.py +++ b/scrapers/src/craigslist.py @@ -6,102 +6,143 @@ def loadPageResources(driver): - scroll = 100 + scroll = 100 - print("Waiting to load...") - time.sleep(2) + print("Waiting to load...") + time.sleep(2) - utils.scrollTo(scroll, driver) + utils.scrollTo(scroll, driver) - loadImgButtons = driver.find_elements("class name", "slider-back-arrow") + loadImgButtons = driver.find_elements("class name", "slider-back-arrow") - time.sleep(2) + time.sleep(2) - # Emulate a user scrolling - for i in range(len(loadImgButtons)): - scroll += 100 - utils.scrollTo(scroll, driver) + # Emulate a user scrolling + for i in range(len(loadImgButtons)): + scroll += 100 + utils.scrollTo(scroll, driver) - utils.clickOn(loadImgButtons[i], driver) + utils.clickOn(loadImgButtons[i], driver) - time.sleep(.5) + time.sleep(0.5) def setupURLs(oldestAllowedCars): - # List of TX cities to scrape; can be expanded - cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", 
"delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] + # List of TX cities to scrape; can be expanded + cities = [ + "abilene", + "amarillo", + "austin", + "beaumont", + "brownsville", + "collegestation", + "corpuschristi", + "dallas", + "nacogdoches", + "delrio", + "elpaso", + "galveston", + "houston", + "killeen", + "laredo", + "lubbock", + "mcallen", + "odessa", + "sanangelo", + "sanantonio", + "sanmarcos", + "bigbend", + "texoma", + "easttexas", + "victoriatx", + "waco", + "wichitafalls", + ] + + # Set the URL of the Facebook Marketplace automotive category + base_url = ( + "https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0" + ) + return [base_url.format(city, oldestAllowedCars) for city in cities] - # Set the URL of the Facebook Marketplace automotive category - base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' - return [base_url.format(city, oldestAllowedCars) for city in cities] def getAllPosts(browser): - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + # Find all of the car listings on the page + return soup.find_all("div", class_="gallery-card") - # Find all of the car listings on the page - return soup.find_all('div', class_='gallery-card') def getCarInfo(post): - title = post.find('span', class_='label').text + title = post.find("span", class_="label").text + + print(f'Scraping "{title}"') + + price = post.find("span", class_="priceinfo").text + metadata = post.find("div", class_="meta").text.split("·") - print(f'Scraping "{title}"') + odometer = metadata[1] + if len(metadata) >= 3: + location = metadata[2] - price = post.find('span', class_='priceinfo').text - metadata = post.find('div', class_="meta").text.split('·') + link = post.find("a", class_="posting-title", href=True)["href"] - odometer = metadata[1] - if (len(metadata) >= 3): - location = metadata[2] - - link = post.find('a', class_='posting-title', href=True)["href"] - - imageElements = post.findAll('img') - images = [img["src"] for img in imageElements] + imageElements = post.findAll("img") + images = [img["src"] for img in imageElements] + + return title, price, location, odometer, link, images - return title, price, location, odometer, link, images def processAttributes(attributes): - processedAttributes = [] - - for attr in attributes: - [label, value] = attr.split(": ") - processedAttributes.append({"label": label.replace(" ", "-").lower(), "value": value}) + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append( + {"label": label.replace(" ", "-").lower(), "value": value} + ) + + return processedAttributes - return processedAttributes def scrapeListing(url): - browser = utils.setupBrowser() - - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) - - print(f"Loading page for {url}") - time.sleep(1) - - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') - - try: - description = soup.find('section', id='postingbody').text - attributes = processAttributes([attr.text for attr in soup.findAll('p', 
class_="attrgroup")[1].findAll('span')]) - map = soup.find('div', id='map') - - car = { - "postBody": description, - "longitude": map["data-longitude"], - "latitude": map["data-latitude"] - } - - for attr in attributes: - car[attr["label"]] = attr["value"] - - return car - except: - print(f"Failed scraping {url}") - - # Close the Selenium WebDriver instance - browser.quit() + browser = utils.setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + try: + description = soup.find("section", id="postingbody").text + attributes = processAttributes( + [ + attr.text + for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span") + ] + ) + map = soup.find("div", id="map") + + car = { + "postBody": description, + "longitude": map["data-longitude"], + "latitude": map["data-latitude"], + } + + for attr in attributes: + car[attr["label"]] = attr["value"] + + return car + except Exception as e: + print(f"Failed scraping {url}: \n{e}") + + # Close the Selenium WebDriver instance + browser.quit() diff --git a/scrapers/src/database.py b/scrapers/src/database.py index 8143c2a..1c88567 100644 --- a/scrapers/src/database.py +++ b/scrapers/src/database.py @@ -7,69 +7,83 @@ db = "scrape" collection = "scraped_raw" + def get_conn(db): - # load environment variable containing db uri (which includes username and password) - load_dotenv() - db_uri = os.environ.get("DB_URI") - - # create a mongodb connection - try: - client = pymongo.MongoClient(db_uri) - - # return a friendly error if a URI error is thrown - except pymongo.errors.ConfigurationError: - print("An Invalid URI host error was received. Is your Atlas host name correct in your connection string (found the .env)?") - return {"success" : False, "db": 0} - - return {"success" : True, "db": client.get_database(db)} - -def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): - car = { - "_id": link, - "source": source, - "scraper-version": scraperVersion, - "scrape-date": str(date.today()), - "title": title, - "price": price, - "location": location, - "odometer": miles, - "link": link - } - - if (images is not None): - car["images"] = images - - if (postBody is not None): - car["postBody"] = postBody - - if (longitude is not None): - car["longitude"] = longitude - - if (latitude is not None): - car["latitude"] = latitude - - if (attributes is not None): - for attr in attributes: - car[attr["label"]] = attr["value"] - - # Insert into collection called "scrape_raw" - conn = get_conn(db) - - if (conn["success"]): - result = conn["db"][collection].insert_one(car) - return result.acknowledged - else: - return False + # load environment variable containing db uri (which includes username and password) + load_dotenv() + db_uri = os.environ.get("DB_URI") + + # create a mongodb connection + try: + client = pymongo.MongoClient(db_uri) + + # return a friendly error if a URI error is thrown + except pymongo.errors.ConfigurationError: + print( + "An Invalid URI host error was received." + " Is your Atlas host name correct in your connection string (found the .env)?" 
+ ) + return {"success": False, "db": 0} + + return {"success": True, "db": client.get_database(db)} + + +def post_raw( + scraperVersion, + source, + title, + price, + location, + miles, + link, + images=None, + postBody=None, + longitude=None, + latitude=None, + attributes=None, +): + car = { + "_id": link, + "source": source, + "scraper-version": scraperVersion, + "scrape-date": str(date.today()), + "title": title, + "price": price, + "location": location, + "odometer": miles, + "link": link, + } + + if images is not None: + car["images"] = images + + if postBody is not None: + car["postBody"] = postBody + + if longitude is not None: + car["longitude"] = longitude + + if latitude is not None: + car["latitude"] = latitude + + if attributes is not None: + for attr in attributes: + car[attr["label"]] = attr["value"] + + # Insert into collection called "scrape_raw" + conn = get_conn(db) + + if conn["success"]: + result = conn["db"][collection].insert_one(car) + return result.acknowledged + else: + return False + def update(link, newFields): - conn = get_conn(db) - if (conn["success"]): - result = conn["db"][collection].update_one( - {'_id': link}, - { - '$set': newFields - } - ) - return result.acknowledged - else: - return False + conn = get_conn(db) + if conn["success"]: + result = conn["db"][collection].update_one({"_id": link}, {"$set": newFields}) + return result.acknowledged + else: + return False diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py index 31752d8..61aef28 100644 --- a/scrapers/src/facebook.py +++ b/scrapers/src/facebook.py @@ -4,113 +4,153 @@ from . import utils -postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" -linkClass = "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv" +postClass = ( + "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4" + " x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" +) +linkClass = ( + "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619" + "x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r" + " xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq" + " x1a2a7pz x1heor9g x1lku1pv" +) thumbnailClass = "xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3" titleClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6" -priceClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u" +priceClass = ( + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel" + " x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i" + " x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7" + " x1s688f xzsf02u" +) metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" listingInfoClass = "x78zum5 xdt5ytf x1iyjqo2 x1n2onr6" listingSectionClass = "xod5an3" -bodyClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" +bodyClass = ( + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx" + " x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty" + " x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" +) + def loadPageResources(driver): - scroll = 100 + scroll = 100 - print("Waiting to load...") - time.sleep(2) - utils.scrollTo(scroll, driver) - time.sleep(1.5) + print("Waiting 
to load...") + time.sleep(2) + utils.scrollTo(scroll, driver) + time.sleep(1.5) - # Emulate a user scrolling - for i in range(10): - scroll += 1000 - utils.scrollTo(scroll, driver) - time.sleep(1) + # Emulate a user scrolling + for i in range(10): + scroll += 1000 + utils.scrollTo(scroll, driver) + time.sleep(1) def setupURLs(oldestAllowedCars): - # List of TX cities to scrape; can be expanded - cities = ['houston', 'dallas', 'austin', 'fortworth', 'elpaso', 'sanantonio'] + # List of TX cities to scrape; can be expanded + cities = ["houston", "dallas", "austin", "fortworth", "elpaso", "sanantonio"] + + # Set the URL of the Facebook Marketplace automotive category + base_url = "https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false" + return [base_url.format(city, oldestAllowedCars) for city in cities] - # Set the URL of the Facebook Marketplace automotive category - base_url = 'https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false' - return [base_url.format(city, oldestAllowedCars) for city in cities] def getAllPosts(browser): - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + # Find all of the car listings on the page + return soup.find_all("div", class_=postClass) - # Find all of the car listings on the page - return soup.find_all('div', class_=postClass) def getCarInfo(post): - title = post.find('span', class_=titleClass).text + title = post.find("span", class_=titleClass).text + + print(f'Scraping "{title}"') - print(f'Scraping "{title}"') + price = post.find("span", class_=priceClass).text + metadata = post.findAll("span", class_=metaClass) - price = post.find('span', class_=priceClass).text - metadata = post.findAll('span', class_=metaClass) + location = metadata[0].text + odometer = metadata[1].text - location = metadata[0].text - odometer = metadata[1].text + link = post.find("a", class_=linkClass, href=True)["href"] + link = "https://facebook.com" + link - link = post.find('a', class_=linkClass, href=True)["href"] - link = "https://facebook.com" + link - - thumbnail = post.find('img', class_=thumbnailClass)["src"] + thumbnail = post.find("img", class_=thumbnailClass)["src"] + + return title, price, location, odometer, link, [thumbnail] - return title, price, location, odometer, link, [thumbnail] def getCarImages(): - # class="x1a0syf3 x1ja2u2z" - return "TODO" + # class="x1a0syf3 x1ja2u2z" + return "TODO" + def processAttributes(attributes): - processedAttributes = [] - - for attr in attributes: - [label, value] = attr.split(": ") - processedAttributes.append({"label": label, "value": value}) + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes - return processedAttributes def scrapeListing(url): - browser = utils.setupBrowser() - - # Navigate to the URL - print(f"Going to {url[0:60]}") - browser.get(url[0:60]) - - print(f"Loading page for {url[0:60]}") - time.sleep(1) - - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') - - try: - seeMoreButton = browser.find_element("class name", "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace(" ", ".")) - utils.clickOn(seeMoreButton, 
browser) - - listingInfo = soup.find('div', class_=listingInfoClass) - # description = listingInfo.find('span', class_="x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u") - print(listingInfo) - - return 2 - - # attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - - # map = soup.find('div', id='map') - # longitude = map["data-longitude"] - # latitude = map["data-latitude"] - - # print([attributes, description, longitude, latitude]) - except Exception as error: - print(error) - return -1 - - # Close the Selenium WebDriver instance - browser.quit() + browser = utils.setupBrowser() + + # Navigate to the URL + print(f"Going to {url[0:60]}") + browser.get(url[0:60]) + + print(f"Loading page for {url[0:60]}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + try: + seeMoreButton = browser.find_element( + "class name", + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace( + " ", "." + ), + ) + utils.clickOn(seeMoreButton, browser) + + listingInfo = soup.find("div", class_=listingInfoClass) + # description = listingInfo.find( + # "span", + # class_=( + # "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq" + # " x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m" + # " x6prxxf xvq8zen xo1l8bm xzsf02u" + # ), + # ) + print(listingInfo) + + return 2 + + # attributes = processAttributes( + # [ + # attr.text + # for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span") + # ] + # ) + + # map = soup.find('div', id='map') + # longitude = map["data-longitude"] + # latitude = map["data-latitude"] + + # print([attributes, description, longitude, latitude]) + except Exception as error: + print(error) + return -1 + + # Close the Selenium WebDriver instance + browser.quit() diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py index 17dbaef..399e64e 100644 --- a/scrapers/src/utils.py +++ b/scrapers/src/utils.py @@ -1,5 +1,4 @@ from selenium import webdriver -from selenium.webdriver.chrome.options import Options from . import craigslist from . 
import database as db @@ -7,63 +6,80 @@ def scrollTo(x, driver): - driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + driver.execute_script( + f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})" + ) + def clickOn(elem, driver): - driver.execute_script("arguments[0].click();", elem) + driver.execute_script("arguments[0].click();", elem) + def createDriverOptions(): - options = webdriver.ChromeOptions() - options.binary_location = '/opt/chrome/chrome' + options = webdriver.ChromeOptions() + options.binary_location = "/opt/chrome/chrome" - options.add_argument("--headless=new") - options.add_argument("--headless=new") - options.add_argument('--no-sandbox') - options.add_argument("--disable-gpu") - options.add_argument("--window-size=1280x1696") - options.add_argument("--single-process") - options.add_argument("--disable-dev-shm-usage") - options.add_argument("--disable-dev-tools") - options.add_argument("--no-zygote") + options.add_argument("--headless=new") + options.add_argument("--headless=new") + options.add_argument("--no-sandbox") + options.add_argument("--disable-gpu") + options.add_argument("--window-size=1280x1696") + options.add_argument("--single-process") + options.add_argument("--disable-dev-shm-usage") + options.add_argument("--disable-dev-tools") + options.add_argument("--no-zygote") + + return options - return options def setupBrowser(): - print("Setting up headless browser") + print("Setting up headless browser") + + service = webdriver.ChromeService("/opt/chromedriver") + options = createDriverOptions() - service = webdriver.ChromeService("/opt/chromedriver") - options = createDriverOptions() + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options, service=service) - print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options, service=service) def scrape(website, scraperVersion): - if (website == 'craigslist'): - scraper = craigslist - elif (website == 'facebook'): - scraper = facebook - - cityURLs = scraper.setupURLs(2011) - browser = setupBrowser() - - for url in cityURLs: - print(f"Going to {url}") - browser.get(url) - - print(f"Loading cars from {url}") - scraper.loadPageResources(browser) - - carPosts = scraper.getAllPosts(browser) - - for post in carPosts: - try: - title, price, location, odometer, link, images = scraper.getCarInfo(post) - success = db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) - if (success): - print("posted to db") - else: - print("failed to post to db") - except Exception as error: - print(error) - - browser.quit() + if website == "craigslist": + scraper = craigslist + elif website == "facebook": + scraper = facebook + + cityURLs = scraper.setupURLs(2011) + browser = setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + scraper.loadPageResources(browser) + + carPosts = scraper.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = scraper.getCarInfo( + post + ) + success = db.post_raw( + scraperVersion, + website, + title, + price, + location, + odometer, + link, + images, + ) + if success: + print("posted to db") + else: + print("failed to post to db") + except Exception as error: + print(error) + + browser.quit() From eda33a90368da468b4469530bd764c1acb7f893f Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 30 Nov 2023 01:52:53 -0600 Subject: [PATCH 45/48] added 
flake8, black, and isort to dev dependencies --- scrapers/Dockerfile | 3 +- scrapers/Pipfile | 3 ++ scrapers/Pipfile.lock | 112 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 115 insertions(+), 3 deletions(-) diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile index 6a2f76a..39359a3 100644 --- a/scrapers/Dockerfile +++ b/scrapers/Dockerfile @@ -3,7 +3,8 @@ RUN yum install -y unzip && \ curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \ curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \ unzip /tmp/chromedriver-linux64.zip -d /opt/ && \ - unzip /tmp/chrome-linux64.zip -d /opt/ + unzip /tmp/chrome-linux64.zip -d /opt/ && \ + yup clean all FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \ diff --git a/scrapers/Pipfile b/scrapers/Pipfile index 4eeccc8..0ecb354 100644 --- a/scrapers/Pipfile +++ b/scrapers/Pipfile @@ -18,6 +18,9 @@ typer = "*" python-dotenv = "*" [dev-packages] +isort = "*" +black = "*" +flake8 = "*" [requires] python_version = "3.11" diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock index 689e92b..bb2797e 100644 --- a/scrapers/Pipfile.lock +++ b/scrapers/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "11dbba9e7645169d8dd9e6cfe9118716f9e853beec34798dce95b25c651c9695" + "sha256": "716098b2b29f4b98c932bd4554e3953a184fea6603a7d3f17e7bd47179932031" }, "pipfile-spec": 6, "requires": { @@ -277,5 +277,113 @@ "version": "==1.2.0" } }, - "develop": {} + "develop": { + "black": { + "hashes": [ + "sha256:250d7e60f323fcfc8ea6c800d5eba12f7967400eb6c2d21ae85ad31c204fb1f4", + "sha256:2a9acad1451632021ee0d146c8765782a0c3846e0e0ea46659d7c4f89d9b212b", + "sha256:412f56bab20ac85927f3a959230331de5614aecda1ede14b373083f62ec24e6f", + "sha256:421f3e44aa67138ab1b9bfbc22ee3780b22fa5b291e4db8ab7eee95200726b07", + "sha256:45aa1d4675964946e53ab81aeec7a37613c1cb71647b5394779e6efb79d6d187", + "sha256:4c44b7211a3a0570cc097e81135faa5f261264f4dfaa22bd5ee2875a4e773bd6", + "sha256:4c68855825ff432d197229846f971bc4d6666ce90492e5b02013bcaca4d9ab05", + "sha256:5133f5507007ba08d8b7b263c7aa0f931af5ba88a29beacc4b2dc23fcefe9c06", + "sha256:54caaa703227c6e0c87b76326d0862184729a69b73d3b7305b6288e1d830067e", + "sha256:58e5f4d08a205b11800332920e285bd25e1a75c54953e05502052738fe16b3b5", + "sha256:698c1e0d5c43354ec5d6f4d914d0d553a9ada56c85415700b81dc90125aac244", + "sha256:6c1cac07e64433f646a9a838cdc00c9768b3c362805afc3fce341af0e6a9ae9f", + "sha256:760415ccc20f9e8747084169110ef75d545f3b0932ee21368f63ac0fee86b221", + "sha256:7f622b6822f02bfaf2a5cd31fdb7cd86fcf33dab6ced5185c35f5db98260b055", + "sha256:cf57719e581cfd48c4efe28543fea3d139c6b6f1238b3f0102a9c73992cbb479", + "sha256:d136ef5b418c81660ad847efe0e55c58c8208b77a57a28a503a5f345ccf01394", + "sha256:dbea0bb8575c6b6303cc65017b46351dc5953eea5c0a59d7b7e3a2d2f433a911", + "sha256:fc7f6a44d52747e65a02558e1d807c82df1d66ffa80a601862040a43ec2e3142" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==23.11.0" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "flake8": { + "hashes": [ + 
"sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23", + "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.1'", + "version": "==6.1.0" + }, + "isort": { + "hashes": [ + "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504", + "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.0'", + "version": "==5.12.0" + }, + "mccabe": { + "hashes": [ + "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", + "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e" + ], + "markers": "python_version >= '3.6'", + "version": "==0.7.0" + }, + "mypy-extensions": { + "hashes": [ + "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", + "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.0" + }, + "packaging": { + "hashes": [ + "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", + "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7" + ], + "markers": "python_version >= '3.7'", + "version": "==23.2" + }, + "pathspec": { + "hashes": [ + "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20", + "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3" + ], + "markers": "python_version >= '3.7'", + "version": "==0.11.2" + }, + "platformdirs": { + "hashes": [ + "sha256:118c954d7e949b35437270383a3f2531e99dd93cf7ce4dc8340d3356d30f173b", + "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731" + ], + "markers": "python_version >= '3.7'", + "version": "==4.0.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f", + "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67" + ], + "markers": "python_version >= '3.8'", + "version": "==2.11.1" + }, + "pyflakes": { + "hashes": [ + "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774", + "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc" + ], + "markers": "python_version >= '3.8'", + "version": "==3.1.0" + } + } } From dc860ece70c8b90a92b7e8a5bea27a78b0d27359 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 1 Dec 2023 08:12:07 -0600 Subject: [PATCH 46/48] fixed hadolint errors in dockerfile --- scrapers/.hadolint.yaml | 2 ++ scrapers/Dockerfile | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 scrapers/.hadolint.yaml diff --git a/scrapers/.hadolint.yaml b/scrapers/.hadolint.yaml new file mode 100644 index 0000000..faf3736 --- /dev/null +++ b/scrapers/.hadolint.yaml @@ -0,0 +1,2 @@ +ignored: + - DL3033 \ No newline at end of file diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile index 39359a3..217ca23 100644 --- a/scrapers/Dockerfile +++ b/scrapers/Dockerfile @@ -4,20 +4,22 @@ RUN yum install -y unzip && \ curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \ unzip /tmp/chromedriver-linux64.zip -d /opt/ && \ unzip /tmp/chrome-linux64.zip -d /opt/ && \ - yup clean all + yum clean all FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \ libXcursor libXdamage libXext 
libXi libXrandr libXScrnSaver \
     libXtst pango at-spi2-atk libXt xorg-x11-server-Xvfb \
     xorg-x11-xauth dbus-glib dbus-glib-devel -y && \
+    yum clean all
 COPY --from=build /opt/chrome-linux64 /opt/chrome
 COPY --from=build /opt/chromedriver-linux64 /opt/
+WORKDIR /var/task
 COPY scrapers.py ./
 COPY src ./src
 COPY requirements.txt ./
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 CMD [ "scrapers.craigslist" ]
\ No newline at end of file

From d5a6170d0dd9d7932a9385b164e7151706d911dd Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Fri, 1 Dec 2023 08:22:34 -0600
Subject: [PATCH 47/48] Added latest versions of yum packages to dockerfile

---
 scrapers/.hadolint.yaml | 2 --
 scrapers/Dockerfile | 10 +++++-----
 2 files changed, 5 insertions(+), 7 deletions(-)
 delete mode 100644 scrapers/.hadolint.yaml

diff --git a/scrapers/.hadolint.yaml b/scrapers/.hadolint.yaml
deleted file mode 100644
index faf3736..0000000
--- a/scrapers/.hadolint.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-ignored:
-  - DL3033
\ No newline at end of file
diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile
index 217ca23..844d302 100644
--- a/scrapers/Dockerfile
+++ b/scrapers/Dockerfile
@@ -1,5 +1,5 @@
 FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
-RUN yum install -y unzip && \
+RUN yum install -y unzip-* && \
     curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
     curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
@@ -7,10 +7,10 @@ RUN yum install -y unzip && \
     yum clean all
 
 FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
-RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \
-    libXcursor libXdamage libXext libXi libXrandr libXScrnSaver \
-    libXtst pango at-spi2-atk libXt xorg-x11-server-Xvfb \
-    xorg-x11-xauth dbus-glib dbus-glib-devel -y && \
+RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
+    libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
+    libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
+    xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
     yum clean all
 COPY --from=build /opt/chrome-linux64 /opt/chrome
 COPY --from=build /opt/chromedriver-linux64 /opt/

From 3a9e8c871a928313b966f2d8957a22ead8737965 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Fri, 1 Dec 2023 08:30:16 -0600
Subject: [PATCH 48/48] isort plz T-T

---
 scrapers/scrapers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index 5d93a24..90cd562 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -1,7 +1,6 @@
 import re
 
 import typer
-
 from src import craigslist as cl
 from src import database as db
 from src import facebook as fb