From 86eecae74f0dca685336e0c044280f1c1c13f07d Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Wed, 18 Oct 2023 15:29:17 -0500 Subject: [PATCH 01/48] Fixed csv output --- scrapers/craigslist.py | 57 ++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index cb6ad22..b668f88 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -10,6 +10,10 @@ # Add more locations and their batch values as needed } +def clean_price_str(str): + price_str = str.replace("$", "").replace(",", "") + return float(price_str) + def fetch_job_postings(location, category): base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" @@ -23,7 +27,7 @@ def fetch_job_postings(location, category): 'lang': 'en', 'searchPath': "cta", "id": "0", - "collectContactInfo": True, + "collectContactInfo": True, } headers = { @@ -39,27 +43,35 @@ def fetch_job_postings(location, category): if response.status_code == 200: data = response.json() + + with open('file.txt', 'w') as f: + json.dump(data["data"]["items"], f, indent=2) else: print("Failed to retrieve data. Status code:", response.status_code) data = None - job_postings = [] - with open('file.txt', 'w') as f: - json.dump(data, f, indent=2) + car_posts = [] if data: - for item in data["data"]["items"]: - job_title = None - commission = None - for element in item: + # For each car post found + for post in data["data"]["items"]: + title = None + price = None + mileage = None + partial_link = None + + for element in post: if isinstance(element, str): - job_title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 7: - commission = element[1] - if job_title and commission: - job_postings.append((job_title, commission)) - return job_postings - + title = element + elif isinstance(element, list) and len(element) > 0 and element[0] == 10: + price = clean_price_str(element[1]) + elif isinstance(element, list) and len(element) > 0 and element[0] == 9: + mileage = element[1] + elif isinstance(element, list) and len(element) > 0 and element[0] == 6: + partial_link = element[1] + if title and price and mileage and partial_link: + car_posts.append((title, price, mileage, partial_link)) + return car_posts else: print("No data available.") @@ -67,9 +79,10 @@ def fetch_job_postings(location, category): location = "dallas" category = "cta" - job_postings = fetch_job_postings(location, category) + car_posts = fetch_job_postings(location, category) - if job_postings: + if car_posts: + print("we have results") current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") category = category.replace("/", "&") csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" @@ -77,10 +90,10 @@ def fetch_job_postings(location, category): with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: writer = csv.writer(file) - writer.writerow(["Job Title", "Commission"]) - for job in job_postings: - writer.writerow([job[0], job[1]]) + writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) + for car in car_posts: + writer.writerow([car[0], car[1], car[2], car[3]]) - print(f"Job postings have been saved to {csv_filename}") + print(f"Car posts have been saved to {csv_filename}") else: - print("No data available.") \ No newline at end of file + print(car_posts) \ No newline at end of file From cb9bbeddbb49b7d2030fafdb75e8d045e60389ba Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Wed, 18 Oct 2023 15:30:39 -0500 Subject: 
[PATCH 02/48] removed debug print statements --- scrapers/craigslist.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index b668f88..1989d9a 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -82,7 +82,6 @@ def fetch_job_postings(location, category): car_posts = fetch_job_postings(location, category) if car_posts: - print("we have results") current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") category = category.replace("/", "&") csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" @@ -96,4 +95,4 @@ def fetch_job_postings(location, category): print(f"Car posts have been saved to {csv_filename}") else: - print(car_posts) \ No newline at end of file + print("No car posts were found. Nothing was saved") \ No newline at end of file From 8255e4bc7838dd62eb7b97fc22686eced7e309d5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 16:47:13 -0500 Subject: [PATCH 03/48] moved scrapers into src dir --- {scrapers => src/scrapers}/craigslist.py | 0 {scrapers => src/scrapers}/facebook.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {scrapers => src/scrapers}/craigslist.py (100%) rename {scrapers => src/scrapers}/facebook.py (100%) diff --git a/scrapers/craigslist.py b/src/scrapers/craigslist.py similarity index 100% rename from scrapers/craigslist.py rename to src/scrapers/craigslist.py diff --git a/scrapers/facebook.py b/src/scrapers/facebook.py similarity index 100% rename from scrapers/facebook.py rename to src/scrapers/facebook.py From 880cc97e75800cb1a5765010f9d9973aca1d7143 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 16:49:20 -0500 Subject: [PATCH 04/48] renamed craigslist to craigslist-api --- src/scrapers/craigslist-api.py | 98 ++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/scrapers/craigslist-api.py diff --git a/src/scrapers/craigslist-api.py b/src/scrapers/craigslist-api.py new file mode 100644 index 0000000..1989d9a --- /dev/null +++ b/src/scrapers/craigslist-api.py @@ -0,0 +1,98 @@ +from datetime import datetime +import csv +import json +import requests + +location_to_batch = { + "newyork": "3-0-360-0-0", + "philadelphia": "17-0-360-0-0", + "dallas": "21-0-360-0-0", + # Add more locations and their batch values as needed +} + +def clean_price_str(str): + price_str = str.replace("$", "").replace(",", "") + return float(price_str) + +def fetch_job_postings(location, category): + base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" + + # Get the batch value and category abbreviation from the mappings + # Default to New York if location not found + batch = location_to_batch.get(location) + + params = { + 'batch': batch, + 'cc': 'US', + 'lang': 'en', + 'searchPath': "cta", + "id": "0", + "collectContactInfo": True, + } + + headers = { + 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', + 'Referer': f'https://{location}.craigslist.org/', + 'sec-ch-ua-mobile': '?0', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', + 'sec-ch-ua-platform': '"Windows"', + 'Cookie': f'cl_b=COOKIE VALUE' + } + + response = requests.get(base_url, params=params, headers=headers) + + if response.status_code == 200: + data = response.json() + + with open('file.txt', 'w') as f: + json.dump(data["data"]["items"], f, indent=2) + else: + print("Failed to retrieve data. 
Status code:", response.status_code) + data = None + + + car_posts = [] + if data: + # For each car post found + for post in data["data"]["items"]: + title = None + price = None + mileage = None + partial_link = None + + for element in post: + if isinstance(element, str): + title = element + elif isinstance(element, list) and len(element) > 0 and element[0] == 10: + price = clean_price_str(element[1]) + elif isinstance(element, list) and len(element) > 0 and element[0] == 9: + mileage = element[1] + elif isinstance(element, list) and len(element) > 0 and element[0] == 6: + partial_link = element[1] + if title and price and mileage and partial_link: + car_posts.append((title, price, mileage, partial_link)) + return car_posts + else: + print("No data available.") + +if __name__ == "__main__": + location = "dallas" + category = "cta" + + car_posts = fetch_job_postings(location, category) + + if car_posts: + current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") + category = category.replace("/", "&") + csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" + + with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + + writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) + for car in car_posts: + writer.writerow([car[0], car[1], car[2], car[3]]) + + print(f"Car posts have been saved to {csv_filename}") + else: + print("No car posts were found. Nothing was saved") \ No newline at end of file From 58cf7ad512869266238e6f25d34d9961ba60ddea Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 19:24:23 -0500 Subject: [PATCH 05/48] craigslist scraper collects image data --- src/scrapers/craigslist.py | 218 ++++++++++++++++++++----------------- 1 file changed, 120 insertions(+), 98 deletions(-) diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist.py index 1989d9a..cbd06c5 100644 --- a/src/scrapers/craigslist.py +++ b/src/scrapers/craigslist.py @@ -1,98 +1,120 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def clean_price_str(str): - price_str = str.replace("$", "").replace(",", "") - return float(price_str) - -def fetch_job_postings(location, category): - base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - - with open('file.txt', 'w') as f: - json.dump(data["data"]["items"], f, indent=2) - else: - print("Failed to retrieve data. 
Status code:", response.status_code) - data = None - - - car_posts = [] - if data: - # For each car post found - for post in data["data"]["items"]: - title = None - price = None - mileage = None - partial_link = None - - for element in post: - if isinstance(element, str): - title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 10: - price = clean_price_str(element[1]) - elif isinstance(element, list) and len(element) > 0 and element[0] == 9: - mileage = element[1] - elif isinstance(element, list) and len(element) > 0 and element[0] == 6: - partial_link = element[1] - if title and price and mileage and partial_link: - car_posts.append((title, price, mileage, partial_link)) - return car_posts - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - car_posts = fetch_job_postings(location, category) - - if car_posts: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) - for car in car_posts: - writer.writerow([car[0], car[1], car[2], car[3]]) - - print(f"Car posts have been saved to {csv_filename}") - else: - print("No car posts were found. Nothing was saved") \ No newline at end of file +from selenium import webdriver +from bs4 import BeautifulSoup +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.action_chains import ActionChains +import time +from datetime import date + +def scrollTo(x, driver): + driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + +def loadPageResources(driver): + scroll = 100 + + print("Waiting to load...") + time.sleep(2) + + scrollTo(scroll, driver) + + loadImgButtons = driver.find_elements("class name", "slider-back-arrow") + + time.sleep(2) + + # Emulate a user scrolling + for i in range(len(loadImgButtons)): + scroll += 100 + scrollTo(scroll, driver) + + driver.execute_script("arguments[0].click();", loadImgButtons[i]) + + time.sleep(.5) + + +def setupURLs(): + #list of cities to scrape; can be expanded + cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] + + oldestAllowedCars = 2011 + + # Set the URL of the Facebook Marketplace automotive category + base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' + return [base_url.format(city, oldestAllowedCars) for city in cities] + +def setupBrowser(): + print("Setting up headless browser") + + options = Options() + # options.add_argument("--headless=new") + + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options) + +def getAllPosts(browser): + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(browser.page_source, 'html.parser') + + # Find all of the car listings on the page + return soup.find_all('div', class_='gallery-card') + +def getCarImages(): + return "TODO" + +def scrapeCarInfo(post): + title = post.find('span', class_='label').text + + 
print(f'Scraping "{title}"') + + price = post.find('span', class_='priceinfo').text + metadata = post.find('div', class_="meta").text.split('·') + + miles = metadata[1] + if (len(metadata) >= 3): + location = metadata[2] + + link = post.find('a', class_='posting-title', href=True)["href"] + + imageElements = post.findAll('img') + images = [img["src"] for img in imageElements] + + return { + "title": title, + "price": price, + "location": location, + "miles": miles, + "link": link, + "images": images, + "scrapeDate": date.today() + } + +def scrapeCraigslist(): + cityURLs = setupURLs() + browser = setupBrowser() + + # Create a list to store the scraped data + print("Started scraping...") + + for url in cityURLs: + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + + loadPageResources(browser) + + carPosts = getAllPosts(browser) + + # Iterate over the listings and scrape the data + for post in carPosts: + try: + car = scrapeCarInfo(post) + print(car) + except: + print("Incomplete listing info") + + # Close the Selenium WebDriver instance + browser.quit() + +if (__name__ == "__main__"): + scrapeCraigslist() \ No newline at end of file From 59455e41c3a3be0a50de7d4dbd9f92cce256e64b Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 20:12:20 -0500 Subject: [PATCH 06/48] nested craigslist homepage and listing scrapers in their own folder --- src/scrapers/{craigslist.py => craigslist/homepage.py} | 4 ++-- src/scrapers/craigslist/listing.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename src/scrapers/{craigslist.py => craigslist/homepage.py} (95%) create mode 100644 src/scrapers/craigslist/listing.py diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist/homepage.py similarity index 95% rename from src/scrapers/craigslist.py rename to src/scrapers/craigslist/homepage.py index cbd06c5..efdd550 100644 --- a/src/scrapers/craigslist.py +++ b/src/scrapers/craigslist/homepage.py @@ -87,7 +87,7 @@ def scrapeCarInfo(post): "scrapeDate": date.today() } -def scrapeCraigslist(): +def scrapeHomepage(): cityURLs = setupURLs() browser = setupBrowser() @@ -117,4 +117,4 @@ def scrapeCraigslist(): browser.quit() if (__name__ == "__main__"): - scrapeCraigslist() \ No newline at end of file + scrapeHomepage() \ No newline at end of file diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py new file mode 100644 index 0000000..e69de29 From 403e5798a4b0b66eb50df29b6a8ceb7b62a34d33 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 21:54:00 -0500 Subject: [PATCH 07/48] set selenium in headless mode --- src/scrapers/craigslist/homepage.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index efdd550..ad96903 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -44,7 +44,7 @@ def setupBrowser(): print("Setting up headless browser") options = Options() - # options.add_argument("--headless=new") + options.add_argument("--headless=new") print("Creating a new Selenium WebDriver instance") return webdriver.Chrome(options=options) @@ -52,7 +52,7 @@ def setupBrowser(): def getAllPosts(browser): # Create a BeautifulSoup object from the HTML of the page html = browser.page_source - soup = BeautifulSoup(browser.page_source, 'html.parser') + soup = BeautifulSoup(html, 'html.parser') # Find all of the car listings on the page return soup.find_all('div', 
class_='gallery-card') @@ -60,7 +60,7 @@ def getAllPosts(browser): def getCarImages(): return "TODO" -def scrapeCarInfo(post): +def getCarInfo(post): title = post.find('span', class_='label').text print(f'Scraping "{title}"') @@ -91,9 +91,6 @@ def scrapeHomepage(): cityURLs = setupURLs() browser = setupBrowser() - # Create a list to store the scraped data - print("Started scraping...") - for url in cityURLs: # Navigate to the URL print(f"Going to {url}") @@ -108,7 +105,7 @@ def scrapeHomepage(): # Iterate over the listings and scrape the data for post in carPosts: try: - car = scrapeCarInfo(post) + car = getCarInfo(post) print(car) except: print("Incomplete listing info") From 16adc908456a33953dd29552904962f962aa3380 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 22:09:03 -0500 Subject: [PATCH 08/48] can scrape description and attributes of craigslist listing --- src/scrapers/craigslist/listing.py | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py index e69de29..4524184 100644 --- a/src/scrapers/craigslist/listing.py +++ b/src/scrapers/craigslist/listing.py @@ -0,0 +1,39 @@ +import time +from bs4 import BeautifulSoup +from homepage import setupBrowser + +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes + +def scrapeListing(url): + browser = setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + try: + description = soup.find('section', id='postingbody').text + attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + print([attributes, description]) + except: + print(f"Failed scraping {url}") + + # Close the Selenium WebDriver instance + browser.quit() + +scrapeListing("https://abilene.craigslist.org/ctd/d/abilene-hyundai-elantra/7681061021.html") \ No newline at end of file From e08ac194a899dbc4352321a0333ee4b9c95b25d9 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 22:15:16 -0500 Subject: [PATCH 09/48] can scrape description and attributes of craigslist listing --- src/scrapers/craigslist/listing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py index 4524184..4e22245 100644 --- a/src/scrapers/craigslist/listing.py +++ b/src/scrapers/craigslist/listing.py @@ -29,7 +29,11 @@ def scrapeListing(url): description = soup.find('section', id='postingbody').text attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - print([attributes, description]) + map = soup.find('div', id='map') + longitude = map["data-longitude"] + latitude = map["data-latitude"] + + print([attributes, description, longitude, latitude]) except: print(f"Failed scraping {url}") From ef58802be059bf3d0c4b041b463a9928bd7ba1db Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 26 Oct 2023 22:27:02 -0500 Subject: [PATCH 10/48] replaced spaces with tabs --- src/scrapers/craigslist/homepage.py | 150 ++++++++++++++-------------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git 
a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index ad96903..a317838 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -6,112 +6,112 @@ from datetime import date def scrollTo(x, driver): - driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") def loadPageResources(driver): - scroll = 100 + scroll = 100 - print("Waiting to load...") - time.sleep(2) + print("Waiting to load...") + time.sleep(2) - scrollTo(scroll, driver) + scrollTo(scroll, driver) - loadImgButtons = driver.find_elements("class name", "slider-back-arrow") + loadImgButtons = driver.find_elements("class name", "slider-back-arrow") - time.sleep(2) + time.sleep(2) - # Emulate a user scrolling - for i in range(len(loadImgButtons)): - scroll += 100 - scrollTo(scroll, driver) + # Emulate a user scrolling + for i in range(len(loadImgButtons)): + scroll += 100 + scrollTo(scroll, driver) - driver.execute_script("arguments[0].click();", loadImgButtons[i]) + driver.execute_script("arguments[0].click();", loadImgButtons[i]) - time.sleep(.5) + time.sleep(.5) def setupURLs(): - #list of cities to scrape; can be expanded - cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] + #list of cities to scrape; can be expanded + cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] - oldestAllowedCars = 2011 + oldestAllowedCars = 2011 - # Set the URL of the Facebook Marketplace automotive category - base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' - return [base_url.format(city, oldestAllowedCars) for city in cities] + # Set the URL of the Facebook Marketplace automotive category + base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' + return [base_url.format(city, oldestAllowedCars) for city in cities] def setupBrowser(): - print("Setting up headless browser") + print("Setting up headless browser") - options = Options() - options.add_argument("--headless=new") + options = Options() + options.add_argument("--headless=new") - print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options) def getAllPosts(browser): - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') - # Find all of the car listings on the page - return soup.find_all('div', class_='gallery-card') + # Find all of the car listings on the page + return soup.find_all('div', class_='gallery-card') def getCarImages(): - return "TODO" + return "TODO" def getCarInfo(post): - title = post.find('span', class_='label').text - - 
print(f'Scraping "{title}"') - - price = post.find('span', class_='priceinfo').text - metadata = post.find('div', class_="meta").text.split('·') - - miles = metadata[1] - if (len(metadata) >= 3): - location = metadata[2] - - link = post.find('a', class_='posting-title', href=True)["href"] - - imageElements = post.findAll('img') - images = [img["src"] for img in imageElements] - - return { - "title": title, - "price": price, - "location": location, - "miles": miles, - "link": link, - "images": images, - "scrapeDate": date.today() - } + title = post.find('span', class_='label').text + + print(f'Scraping "{title}"') + + price = post.find('span', class_='priceinfo').text + metadata = post.find('div', class_="meta").text.split('·') + + miles = metadata[1] + if (len(metadata) >= 3): + location = metadata[2] + + link = post.find('a', class_='posting-title', href=True)["href"] + + imageElements = post.findAll('img') + images = [img["src"] for img in imageElements] + + return { + "title": title, + "price": price, + "location": location, + "miles": miles, + "link": link, + "images": images, + "scrapeDate": date.today() + } def scrapeHomepage(): - cityURLs = setupURLs() - browser = setupBrowser() + cityURLs = setupURLs() + browser = setupBrowser() - for url in cityURLs: - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) + for url in cityURLs: + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) - print(f"Loading cars from {url}") + print(f"Loading cars from {url}") - loadPageResources(browser) + loadPageResources(browser) - carPosts = getAllPosts(browser) + carPosts = getAllPosts(browser) - # Iterate over the listings and scrape the data - for post in carPosts: - try: - car = getCarInfo(post) - print(car) - except: - print("Incomplete listing info") - - # Close the Selenium WebDriver instance - browser.quit() + # Iterate over the listings and scrape the data + for post in carPosts: + try: + car = getCarInfo(post) + print(car) + except: + print("Incomplete listing info") + + # Close the Selenium WebDriver instance + browser.quit() if (__name__ == "__main__"): - scrapeHomepage() \ No newline at end of file + scrapeHomepage() \ No newline at end of file From be1975ae809b2a38235ab4f58c4ee57fc82952a5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 27 Oct 2023 10:24:55 -0500 Subject: [PATCH 11/48] updated miles label to odometer --- src/scrapers/craigslist/homepage.py | 7 ++++--- src/scrapers/craigslist/listing.py | 4 +--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index a317838..7788c40 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -68,7 +68,7 @@ def getCarInfo(post): price = post.find('span', class_='priceinfo').text metadata = post.find('div', class_="meta").text.split('·') - miles = metadata[1] + odometer = metadata[1] if (len(metadata) >= 3): location = metadata[2] @@ -78,13 +78,14 @@ def getCarInfo(post): images = [img["src"] for img in imageElements] return { + "_id": link, "title": title, "price": price, "location": location, - "miles": miles, + "odometer": odometer, "link": link, "images": images, - "scrapeDate": date.today() + "scrapeDate": str(date.today()) } def scrapeHomepage(): diff --git a/src/scrapers/craigslist/listing.py b/src/scrapers/craigslist/listing.py index 4e22245..bd3a083 100644 --- a/src/scrapers/craigslist/listing.py +++ b/src/scrapers/craigslist/listing.py @@ -38,6 +38,4 @@ def scrapeListing(url): 
print(f"Failed scraping {url}") # Close the Selenium WebDriver instance - browser.quit() - -scrapeListing("https://abilene.craigslist.org/ctd/d/abilene-hyundai-elantra/7681061021.html") \ No newline at end of file + browser.quit() \ No newline at end of file From ea32b66292232768fd349cfbb79e15821d20a2c7 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 18:14:12 -0500 Subject: [PATCH 12/48] added .pyc files to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index c6bba59..f725835 100644 --- a/.gitignore +++ b/.gitignore @@ -128,3 +128,6 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* + +# python +*.pyc \ No newline at end of file From b3fe6af9ff895fbb7ef247a291dafba8e72ad5e1 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 18:16:36 -0500 Subject: [PATCH 13/48] removed un-used import --- src/scrapers/craigslist/homepage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scrapers/craigslist/homepage.py b/src/scrapers/craigslist/homepage.py index 7788c40..482cf6d 100644 --- a/src/scrapers/craigslist/homepage.py +++ b/src/scrapers/craigslist/homepage.py @@ -1,7 +1,6 @@ from selenium import webdriver from bs4 import BeautifulSoup from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.action_chains import ActionChains import time from datetime import date @@ -31,7 +30,7 @@ def loadPageResources(driver): def setupURLs(): - #list of cities to scrape; can be expanded + # List of TX cities to scrape; can be expanded cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] oldestAllowedCars = 2011 From 66ac3dd453c793a811e1b767081ffd07c6358ce8 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 20:09:00 -0500 Subject: [PATCH 14/48] delete duplicate craigslist file --- src/scrapers/craigslist.py | 98 -------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 src/scrapers/craigslist.py diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist.py deleted file mode 100644 index 1989d9a..0000000 --- a/src/scrapers/craigslist.py +++ /dev/null @@ -1,98 +0,0 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def clean_price_str(str): - price_str = str.replace("$", "").replace(",", "") - return float(price_str) - -def fetch_job_postings(location, category): - base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 
'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - - with open('file.txt', 'w') as f: - json.dump(data["data"]["items"], f, indent=2) - else: - print("Failed to retrieve data. Status code:", response.status_code) - data = None - - - car_posts = [] - if data: - # For each car post found - for post in data["data"]["items"]: - title = None - price = None - mileage = None - partial_link = None - - for element in post: - if isinstance(element, str): - title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 10: - price = clean_price_str(element[1]) - elif isinstance(element, list) and len(element) > 0 and element[0] == 9: - mileage = element[1] - elif isinstance(element, list) and len(element) > 0 and element[0] == 6: - partial_link = element[1] - if title and price and mileage and partial_link: - car_posts.append((title, price, mileage, partial_link)) - return car_posts - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - car_posts = fetch_job_postings(location, category) - - if car_posts: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) - for car in car_posts: - writer.writerow([car[0], car[1], car[2], car[3]]) - - print(f"Car posts have been saved to {csv_filename}") - else: - print("No car posts were found. Nothing was saved") \ No newline at end of file From 8d2acb4dc8b1ceed0049be407f3ff5231769c553 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Mon, 30 Oct 2023 20:11:44 -0500 Subject: [PATCH 15/48] moved scrapers dir to root --- {src/scrapers => scrapers}/craigslist-api.py | 0 {src/scrapers => scrapers}/craigslist/homepage.py | 0 {src/scrapers => scrapers}/craigslist/listing.py | 0 {src/scrapers => scrapers}/database.py | 0 {src/scrapers => scrapers}/facebook.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename {src/scrapers => scrapers}/craigslist-api.py (100%) rename {src/scrapers => scrapers}/craigslist/homepage.py (100%) rename {src/scrapers => scrapers}/craigslist/listing.py (100%) rename {src/scrapers => scrapers}/database.py (100%) rename {src/scrapers => scrapers}/facebook.py (100%) diff --git a/src/scrapers/craigslist-api.py b/scrapers/craigslist-api.py similarity index 100% rename from src/scrapers/craigslist-api.py rename to scrapers/craigslist-api.py diff --git a/src/scrapers/craigslist/homepage.py b/scrapers/craigslist/homepage.py similarity index 100% rename from src/scrapers/craigslist/homepage.py rename to scrapers/craigslist/homepage.py diff --git a/src/scrapers/craigslist/listing.py b/scrapers/craigslist/listing.py similarity index 100% rename from src/scrapers/craigslist/listing.py rename to scrapers/craigslist/listing.py diff --git a/src/scrapers/database.py b/scrapers/database.py similarity index 100% rename from src/scrapers/database.py rename to scrapers/database.py diff --git a/src/scrapers/facebook.py b/scrapers/facebook.py similarity index 100% rename from src/scrapers/facebook.py rename to scrapers/facebook.py From b4cc2f4c49e626bad01460cb1e2196c3b06b2bdf Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 
Oct 2023 10:45:45 -0500 Subject: [PATCH 16/48] Grouped homepage and listing files into one craigslist file --- .../{craigslist/homepage.py => craigslist.py} | 71 +++++++++---------- scrapers/craigslist/listing.py | 41 ----------- scrapers/scrapers.py | 42 +++++++++++ 3 files changed, 76 insertions(+), 78 deletions(-) rename scrapers/{craigslist/homepage.py => craigslist.py} (67%) delete mode 100644 scrapers/craigslist/listing.py create mode 100644 scrapers/scrapers.py diff --git a/scrapers/craigslist/homepage.py b/scrapers/craigslist.py similarity index 67% rename from scrapers/craigslist/homepage.py rename to scrapers/craigslist.py index 482cf6d..66bc8df 100644 --- a/scrapers/craigslist/homepage.py +++ b/scrapers/craigslist.py @@ -2,7 +2,6 @@ from bs4 import BeautifulSoup from selenium.webdriver.chrome.options import Options import time -from datetime import date def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") @@ -29,12 +28,10 @@ def loadPageResources(driver): time.sleep(.5) -def setupURLs(): +def setupURLs(oldestAllowedCars): # List of TX cities to scrape; can be expanded cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", "delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] - oldestAllowedCars = 2011 - # Set the URL of the Facebook Marketplace automotive category base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' return [base_url.format(city, oldestAllowedCars) for city in cities] @@ -76,42 +73,42 @@ def getCarInfo(post): imageElements = post.findAll('img') images = [img["src"] for img in imageElements] - return { - "_id": link, - "title": title, - "price": price, - "location": location, - "odometer": odometer, - "link": link, - "images": images, - "scrapeDate": str(date.today()) - } - -def scrapeHomepage(): - cityURLs = setupURLs() - browser = setupBrowser() + return title, price, location, odometer, link, images - for url in cityURLs: - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) - print(f"Loading cars from {url}") + return processedAttributes - loadPageResources(browser) +def scrapeListing(url): + browser = setupBrowser() - carPosts = getAllPosts(browser) + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) - # Iterate over the listings and scrape the data - for post in carPosts: - try: - car = getCarInfo(post) - print(car) - except: - print("Incomplete listing info") - - # Close the Selenium WebDriver instance - browser.quit() + print(f"Loading page for {url}") + time.sleep(1) -if (__name__ == "__main__"): - scrapeHomepage() \ No newline at end of file + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + try: + description = soup.find('section', id='postingbody').text + attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + map = soup.find('div', id='map') + longitude = map["data-longitude"] + latitude = map["data-latitude"] + + print([attributes, description, longitude, latitude]) + 
except: + print(f"Failed scraping {url}") + + # Close the Selenium WebDriver instance + browser.quit() \ No newline at end of file diff --git a/scrapers/craigslist/listing.py b/scrapers/craigslist/listing.py deleted file mode 100644 index bd3a083..0000000 --- a/scrapers/craigslist/listing.py +++ /dev/null @@ -1,41 +0,0 @@ -import time -from bs4 import BeautifulSoup -from homepage import setupBrowser - -def processAttributes(attributes): - processedAttributes = [] - - for attr in attributes: - [label, value] = attr.split(": ") - processedAttributes.append({"label": label, "value": value}) - - return processedAttributes - -def scrapeListing(url): - browser = setupBrowser() - - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) - - print(f"Loading page for {url}") - time.sleep(1) - - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') - - try: - description = soup.find('section', id='postingbody').text - attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - - map = soup.find('div', id='map') - longitude = map["data-longitude"] - latitude = map["data-latitude"] - - print([attributes, description, longitude, latitude]) - except: - print(f"Failed scraping {url}") - - # Close the Selenium WebDriver instance - browser.quit() \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py new file mode 100644 index 0000000..8c14383 --- /dev/null +++ b/scrapers/scrapers.py @@ -0,0 +1,42 @@ +import craigslist as cl +import database as db +from typing import Optional +import typer +from typing_extensions import Annotated + +app = typer.Typer() + +@app.command() +def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): + cityURLs = cl.setupURLs(minYear) + browser = cl.setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + cl.loadPageResources(browser) + + carPosts = cl.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = cl.getCarInfo(post) + db.post_raw("craigslist", title, price, location, odometer, link, images) + except Exception as error: + print(error) + + browser.quit() + +@app.command() +def link(link: str): + if (".craigslist.org" in link): + cl.scrapeListing(link) + elif("https://www.facebook.com/marketplace" in link): + print("facebook marketplace") + else: + print("Not a Craigslist nor a Facebook Marketplace link") + +if __name__ == "__main__": + app() \ No newline at end of file From b2a1087d1cc376acab2d29995e999fc84f509417 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 10:46:32 -0500 Subject: [PATCH 17/48] Removed craigslist-api file --- scrapers/craigslist-api.py | 98 -------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 scrapers/craigslist-api.py diff --git a/scrapers/craigslist-api.py b/scrapers/craigslist-api.py deleted file mode 100644 index 1989d9a..0000000 --- a/scrapers/craigslist-api.py +++ /dev/null @@ -1,98 +0,0 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def clean_price_str(str): - price_str = str.replace("$", "").replace(",", "") - return float(price_str) - -def fetch_job_postings(location, category): 
- base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - - with open('file.txt', 'w') as f: - json.dump(data["data"]["items"], f, indent=2) - else: - print("Failed to retrieve data. Status code:", response.status_code) - data = None - - - car_posts = [] - if data: - # For each car post found - for post in data["data"]["items"]: - title = None - price = None - mileage = None - partial_link = None - - for element in post: - if isinstance(element, str): - title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 10: - price = clean_price_str(element[1]) - elif isinstance(element, list) and len(element) > 0 and element[0] == 9: - mileage = element[1] - elif isinstance(element, list) and len(element) > 0 and element[0] == 6: - partial_link = element[1] - if title and price and mileage and partial_link: - car_posts.append((title, price, mileage, partial_link)) - return car_posts - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - car_posts = fetch_job_postings(location, category) - - if car_posts: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Title", "Price", "Mileage", "Partial HTML Path"]) - for car in car_posts: - writer.writerow([car[0], car[1], car[2], car[3]]) - - print(f"Car posts have been saved to {csv_filename}") - else: - print("No car posts were found. 
Nothing was saved") \ No newline at end of file From f1c73723903cf6017ad757a64caa46913ed3148c Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 10:51:19 -0500 Subject: [PATCH 18/48] extracted utils from craigslist scraper --- scrapers/craigslist.py | 19 +++---------------- scrapers/scrapers.py | 11 +++++++---- scrapers/utils.py | 14 ++++++++++++++ 3 files changed, 24 insertions(+), 20 deletions(-) create mode 100644 scrapers/utils.py diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 66bc8df..ca37236 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -1,10 +1,6 @@ -from selenium import webdriver from bs4 import BeautifulSoup -from selenium.webdriver.chrome.options import Options import time - -def scrollTo(x, driver): - driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") +import utils def loadPageResources(driver): scroll = 100 @@ -12,7 +8,7 @@ def loadPageResources(driver): print("Waiting to load...") time.sleep(2) - scrollTo(scroll, driver) + utils.scrollTo(scroll, driver) loadImgButtons = driver.find_elements("class name", "slider-back-arrow") @@ -21,7 +17,7 @@ def loadPageResources(driver): # Emulate a user scrolling for i in range(len(loadImgButtons)): scroll += 100 - scrollTo(scroll, driver) + utils.scrollTo(scroll, driver) driver.execute_script("arguments[0].click();", loadImgButtons[i]) @@ -36,15 +32,6 @@ def setupURLs(oldestAllowedCars): base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' return [base_url.format(city, oldestAllowedCars) for city in cities] -def setupBrowser(): - print("Setting up headless browser") - - options = Options() - options.add_argument("--headless=new") - - print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) - def getAllPosts(browser): # Create a BeautifulSoup object from the HTML of the page html = browser.page_source diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 8c14383..e721237 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,15 +1,18 @@ -import craigslist as cl -import database as db from typing import Optional -import typer from typing_extensions import Annotated +import typer + +import craigslist as cl +import facebook as fb +import database as db +import utils app = typer.Typer() @app.command() def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): cityURLs = cl.setupURLs(minYear) - browser = cl.setupBrowser() + browser = utils.setupBrowser() for url in cityURLs: print(f"Going to {url}") diff --git a/scrapers/utils.py b/scrapers/utils.py new file mode 100644 index 0000000..2a5a865 --- /dev/null +++ b/scrapers/utils.py @@ -0,0 +1,14 @@ +from selenium import webdriver +from selenium.webdriver.chrome.options import Options + +def scrollTo(x, driver): + driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + +def setupBrowser(): + print("Setting up headless browser") + + options = Options() + options.add_argument("--headless=new") + + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options) \ No newline at end of file From 5e694dd9dea32e4d44ae818799220d9ec4c42c81 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:03:48 -0500 Subject: [PATCH 19/48] removed un-used function --- scrapers/craigslist.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index ca37236..213cf45 100644 --- 
a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -40,9 +40,6 @@ def getAllPosts(browser): # Find all of the car listings on the page return soup.find_all('div', class_='gallery-card') -def getCarImages(): - return "TODO" - def getCarInfo(post): title = post.find('span', class_='label').text From 912006b0bc68d9804a4f668184800f3ad309e752 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:36:48 -0500 Subject: [PATCH 20/48] Updated facebook scraper --- scrapers/facebook.py | 206 ++++++++++++++++++++----------------------- scrapers/scrapers.py | 27 +++++- 2 files changed, 123 insertions(+), 110 deletions(-) diff --git a/scrapers/facebook.py b/scrapers/facebook.py index 00b5e45..5b8c43f 100644 --- a/scrapers/facebook.py +++ b/scrapers/facebook.py @@ -1,111 +1,101 @@ -from selenium import webdriver from bs4 import BeautifulSoup -from selenium.webdriver.chrome.options import Options import time +import utils -import database - - -#list of cities to scrape; can be expanded -cities = [ - 'nyc', 'la', 'chicago', 'houston', 'miami', - 'philadelphia', 'phoenix', 'sanantonio', 'sandiego', 'dallas', - 'sanjose', 'austin', 'jacksonville', 'fortworth', 'columbus', - 'charlotte', 'sanfrancisco', 'indianapolis', 'seattle', 'denver', - 'washington', 'boston', 'elpaso', 'nashville', 'detroit', 'portland', 'lasvegas', 'memphis', 'louisville', - 'baltimore', 'milwaukee', 'albuquerque', 'tucson', 'fresno', - 'kansascity', 'mesa', 'atlanta', - 'coloradosprings', 'virginiabeach', 'raleigh', 'omaha', 'miami', - 'oakland', 'minneapolis', 'tulsa', 'wichita', 'neworleans' -] - -# Set the URL of the Facebook Marketplace automotive category -base_url = 'https://www.facebook.com/marketplace/{}/vehicles' -urls = [base_url.format(city) for city in cities] - -# Create a new Selenium WebDriver instance - -print("Setting up headless browser") -options = Options() -options.add_argument("--headless=new") - -print("Creating a new Selenium WebDriver instance") -driver = webdriver.Chrome(options=options) - -# Create a list to store the scraped data -print("Started scraping...") -data = {} -for url in urls: - # Navigate to the URL - print(f"Navigating to {url}") - driver.get(url) - - print(f"Loading {url}") - - time.sleep(2) - scroll = 2000 - - # Wait for the page to load - time.sleep(2) - - for i in range(50): - driver.execute_script(f"window.scrollTo(1, {scroll})") - scroll += 1000 - time.sleep(.5) - - # Get the HTML of the page - html = driver.page_source - - # Create a BeautifulSoup object from the HTML - soup = BeautifulSoup(html, 'html.parser') - - # Find all of the automotive listings on the page - car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24') - - # Iterate over the listings and scrape the data - for post in car_posts: - print("Scraping new listing") - try: - # Get the title of the listing - title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text - except AttributeError: - title = 'N/A' # Handle missing title - - try: - # Get the price of the listing - price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text - except AttributeError: - price = 'N/A' # Handle missing price - - try: - # Get the location of the listing - location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text - except 
AttributeError: - location = 'N/A' # Handle missing location - - try: - # Get the miles of the car - miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text - except (AttributeError, IndexError): - miles = 'N/A' # Handle missing miles - - try: - # Get the link to the listing - link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href'] - except (AttributeError, TypeError): - link = 'N/A' # Handle missing link - - # Add the data to the list - if (title, price, location, miles, link) not in data: - data[(title, price, location, miles, link)] = True - postSuccess = database.post_raw("facebook", title, price, location, miles, link) - if (postSuccess): - print("Save to DB") - else: - print("Failed to save to DB") - else: - print("Listing is a duplicate") - - -# Close the Selenium WebDriver instance -driver.quit() \ No newline at end of file +postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" +linkClass = "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv" +thumbnailClass = "xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3" +titleClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6" +priceClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u" +metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" + +def loadPageResources(driver): + scroll = 100 + + print("Waiting to load...") + time.sleep(2) + utils.scrollTo(scroll, driver) + time.sleep(1.5) + + # Emulate a user scrolling + for i in range(10): + scroll += 1000 + utils.scrollTo(scroll, driver) + time.sleep(1) + + +def setupURLs(oldestAllowedCars): + # List of TX cities to scrape; can be expanded + cities = ['houston', 'dallas', 'austin', 'fortworth', 'elpaso', 'sanantonio'] + + # Set the URL of the Facebook Marketplace automotive category + base_url = 'https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false' + return [base_url.format(city, oldestAllowedCars) for city in cities] + +def getAllPosts(browser): + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + # Find all of the car listings on the page + return soup.find_all('div', class_=postClass) + +def getCarInfo(post): + title = post.find('span', class_=titleClass).text + + print(f'Scraping "{title}"') + + price = post.find('span', class_=priceClass).text + metadata = post.findAll('span', class_=metaClass) + + location = metadata[0].text + odometer = metadata[1].text + + link = post.find('a', class_=linkClass, href=True)["href"] + link = "https://facebook.com" + link + + thumbnail = post.find('img', class_=thumbnailClass)["src"] + + return title, price, location, odometer, link, [thumbnail] + +def getCarImages(): + # class="x1a0syf3 x1ja2u2z" + return "TODO" + +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes + +def 
scrapeListing(url): + browser = setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, 'html.parser') + + try: + description = soup.find('section', id='postingbody').text + attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + map = soup.find('div', id='map') + longitude = map["data-longitude"] + latitude = map["data-latitude"] + + print([attributes, description, longitude, latitude]) + except: + print(f"Failed scraping {url}") + + # Close the Selenium WebDriver instance + browser.quit() \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index e721237..883d3fb 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -10,8 +10,8 @@ app = typer.Typer() @app.command() -def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): - cityURLs = cl.setupURLs(minYear) +def craigslist(): + cityURLs = cl.setupURLs(2011) browser = utils.setupBrowser() for url in cityURLs: @@ -32,6 +32,29 @@ def craigslist(minYear: Annotated[Optional[int], typer.Argument()] = 2011): browser.quit() +@app.command() +def facebook(): + cityURLs = fb.setupURLs(2011) + browser = utils.setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + fb.loadPageResources(browser) + + carPosts = fb.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = fb.getCarInfo(post) + db.post_raw("facebook", title, price, location, odometer, link, images) + except Exception as error: + print(error) + + browser.quit() + @app.command() def link(link: str): if (".craigslist.org" in link): From f1298de3adc3a27947040eb2171fbb1247934635 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:40:56 -0500 Subject: [PATCH 21/48] Extracted scraper logic into utils --- scrapers/scrapers.py | 51 ++++++-------------------------------------- scrapers/utils.py | 26 +++++++++++++++++++++- 2 files changed, 31 insertions(+), 46 deletions(-) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 883d3fb..b10029f 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -2,65 +2,26 @@ from typing_extensions import Annotated import typer -import craigslist as cl -import facebook as fb -import database as db +import craigslist +import facebook import utils app = typer.Typer() @app.command() def craigslist(): - cityURLs = cl.setupURLs(2011) - browser = utils.setupBrowser() - - for url in cityURLs: - print(f"Going to {url}") - browser.get(url) - - print(f"Loading cars from {url}") - cl.loadPageResources(browser) - - carPosts = cl.getAllPosts(browser) - - for post in carPosts: - try: - title, price, location, odometer, link, images = cl.getCarInfo(post) - db.post_raw("craigslist", title, price, location, odometer, link, images) - except Exception as error: - print(error) - - browser.quit() + utils.scrape(craigslist, "craigslist") @app.command() def facebook(): - cityURLs = fb.setupURLs(2011) - browser = utils.setupBrowser() - - for url in cityURLs: - print(f"Going to {url}") - browser.get(url) - - print(f"Loading cars from {url}") - fb.loadPageResources(browser) - - carPosts = fb.getAllPosts(browser) - - for post in carPosts: - try: - title, price, location, odometer, link, images = fb.getCarInfo(post) - 
db.post_raw("facebook", title, price, location, odometer, link, images) - except Exception as error: - print(error) - - browser.quit() + utils.scrape(facebook, "facebook") @app.command() def link(link: str): if (".craigslist.org" in link): - cl.scrapeListing(link) + craigslist.scrapeListing(link) elif("https://www.facebook.com/marketplace" in link): - print("facebook marketplace") + facebook.scrapeListing(link) else: print("Not a Craigslist nor a Facebook Marketplace link") diff --git a/scrapers/utils.py b/scrapers/utils.py index 2a5a865..d8cc307 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -1,5 +1,7 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options +import utils +import database as db def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") @@ -11,4 +13,26 @@ def setupBrowser(): options.add_argument("--headless=new") print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) \ No newline at end of file + return webdriver.Chrome(options=options) + +def scrape(scraper, website): + cityURLs = scraper.setupURLs(2011) + browser = utils.setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + scraper.loadPageResources(browser) + + carPosts = scraper.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = scraper.getCarInfo(post) + db.post_raw(website, title, price, location, odometer, link, images) + except Exception as error: + print(error) + + browser.quit() \ No newline at end of file From f6c0a03b49e36cd7ae5c5b3f403c4ce195ae4d9f Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 11:41:31 -0500 Subject: [PATCH 22/48] removed un-used imports --- scrapers/scrapers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index b10029f..efaf399 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,7 +1,4 @@ -from typing import Optional -from typing_extensions import Annotated import typer - import craigslist import facebook import utils From 70c01d4f24b38c1925c28cbc2a2af9cc8f6db521 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 12:02:01 -0500 Subject: [PATCH 23/48] Track scraper versions in db --- scrapers/database.py | 9 +++++---- scrapers/scrapers.py | 7 +++++-- scrapers/utils.py | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/scrapers/database.py b/scrapers/database.py index ed36ac8..10916ca 100644 --- a/scrapers/database.py +++ b/scrapers/database.py @@ -20,15 +20,16 @@ def get_conn(db): # use a database named "Test" return {"success" : True, "db": client.get_database(db)} -def post_raw(source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): +def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): car = { + "source": source, + "scraperVersion": scraperVersion, + "scrapeDate": str(date.today()), "title": title, "price": price, "location": location, "odometer": miles, - "link": link, - "source": source, - "scrapeDate": str(date.today()) + "link": link } if (images is not None): diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index efaf399..8f6d92a 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -5,13 +5,16 @@ app = 
typer.Typer() +craigslistScraperVersion = 1 +facebookScraperVersion = 1 + @app.command() def craigslist(): - utils.scrape(craigslist, "craigslist") + utils.scrape(craigslist, "craigslist", craigslistScraperVersion) @app.command() def facebook(): - utils.scrape(facebook, "facebook") + utils.scrape(facebook, "facebook", facebookScraperVersion) @app.command() def link(link: str): diff --git a/scrapers/utils.py b/scrapers/utils.py index d8cc307..2daf8f7 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -15,7 +15,7 @@ def setupBrowser(): print("Creating a new Selenium WebDriver instance") return webdriver.Chrome(options=options) -def scrape(scraper, website): +def scrape(scraper, website, scraperVersion): cityURLs = scraper.setupURLs(2011) browser = utils.setupBrowser() @@ -31,7 +31,7 @@ def scrape(scraper, website): for post in carPosts: try: title, price, location, odometer, link, images = scraper.getCarInfo(post) - db.post_raw(website, title, price, location, odometer, link, images) + db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) except Exception as error: print(error) From 2182688247f0ba3db1d163358e6fefb2e51a685e Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 12:48:46 -0500 Subject: [PATCH 24/48] use link as _id for db --- scrapers/craigslist.py | 2 +- scrapers/database.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 213cf45..1f4553d 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -69,7 +69,7 @@ def processAttributes(attributes): return processedAttributes def scrapeListing(url): - browser = setupBrowser() + browser = utils.setupBrowser() # Navigate to the URL print(f"Going to {url}") diff --git a/scrapers/database.py b/scrapers/database.py index 10916ca..e168344 100644 --- a/scrapers/database.py +++ b/scrapers/database.py @@ -22,6 +22,7 @@ def get_conn(db): def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): car = { + "_id": link, "source": source, "scraperVersion": scraperVersion, "scrapeDate": str(date.today()), @@ -49,10 +50,24 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images car[attr["label"]] = attr["value"] # Insert into collection called "scrape_test" - conn = get_conn("scrape") + conn = get_conn("Test") if (conn["success"]): - result = conn["db"]["scraped_raw"].insert_one(car) + result = conn["db"]["raw"].insert_one(car) return result.acknowledged else: - return False \ No newline at end of file + return False + +def update(link, newFields): + conn = get_conn("Test") + if (conn["success"]): + result = conn["db"]["raw"].update( + {'_id': link}, + { + '$set': newFields + } + ) + return result.acknowledged + else: + return False + From 146948aa643a9128b469a70cbbc7483e68592e59 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 12:49:23 -0500 Subject: [PATCH 25/48] Fixed import issues --- scrapers/scrapers.py | 12 ++++++------ scrapers/utils.py | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 8f6d92a..aeaab0d 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,6 +1,6 @@ import typer -import craigslist -import facebook +import craigslist as cl +import facebook as fb import utils app = typer.Typer() @@ -10,18 +10,18 @@ @app.command() def 
craigslist(): - utils.scrape(craigslist, "craigslist", craigslistScraperVersion) + utils.scrape("craigslist", craigslistScraperVersion) @app.command() def facebook(): - utils.scrape(facebook, "facebook", facebookScraperVersion) + utils.scrape("facebook", facebookScraperVersion) @app.command() def link(link: str): if (".craigslist.org" in link): - craigslist.scrapeListing(link) + cl.scrapeListing(link) elif("https://www.facebook.com/marketplace" in link): - facebook.scrapeListing(link) + fb.scrapeListing(link) else: print("Not a Craigslist nor a Facebook Marketplace link") diff --git a/scrapers/utils.py b/scrapers/utils.py index 2daf8f7..a62a1bc 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -3,6 +3,9 @@ import utils import database as db +import craigslist +import facebook + def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") @@ -15,7 +18,12 @@ def setupBrowser(): print("Creating a new Selenium WebDriver instance") return webdriver.Chrome(options=options) -def scrape(scraper, website, scraperVersion): +def scrape(website, scraperVersion): + if (website == 'craigslist'): + scraper = craigslist + elif (website == 'facebook'): + scraper = facebook + cityURLs = scraper.setupURLs(2011) browser = utils.setupBrowser() @@ -31,7 +39,11 @@ def scrape(scraper, website, scraperVersion): for post in carPosts: try: title, price, location, odometer, link, images = scraper.getCarInfo(post) - db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) + success = db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) + if (success): + print("posted to db") + else: + print("failed to post to db") except Exception as error: print(error) From 7f893e09763783b7f5355407e177438bacf434d5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 13:02:45 -0500 Subject: [PATCH 26/48] craigslist scraper functionality complete --- scrapers/craigslist.py | 16 +++++++++++----- scrapers/database.py | 6 +++--- scrapers/scrapers.py | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 1f4553d..82a1706 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -64,7 +64,7 @@ def processAttributes(attributes): for attr in attributes: [label, value] = attr.split(": ") - processedAttributes.append({"label": label, "value": value}) + processedAttributes.append({"label": label.replace(" ", "-").lower(), "value": value}) return processedAttributes @@ -85,12 +85,18 @@ def scrapeListing(url): try: description = soup.find('section', id='postingbody').text attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - map = soup.find('div', id='map') - longitude = map["data-longitude"] - latitude = map["data-latitude"] - print([attributes, description, longitude, latitude]) + car = { + "postBody": description, + "longitude": map["data-longitude"], + "latitude": map["data-latitude"] + } + + for attr in attributes: + car[attr["label"]] = attr["value"] + + return car except: print(f"Failed scraping {url}") diff --git a/scrapers/database.py b/scrapers/database.py index e168344..40285e2 100644 --- a/scrapers/database.py +++ b/scrapers/database.py @@ -24,8 +24,8 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images car = { "_id": link, "source": source, - "scraperVersion": scraperVersion, - "scrapeDate": str(date.today()), + "scraper-version": 
scraperVersion, + "scrape-date": str(date.today()), "title": title, "price": price, "location": location, @@ -61,7 +61,7 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images def update(link, newFields): conn = get_conn("Test") if (conn["success"]): - result = conn["db"]["raw"].update( + result = conn["db"]["raw"].update_one( {'_id': link}, { '$set': newFields diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index aeaab0d..a3795e3 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,7 +1,8 @@ import typer +import utils import craigslist as cl import facebook as fb -import utils +import database as db app = typer.Typer() @@ -19,7 +20,8 @@ def facebook(): @app.command() def link(link: str): if (".craigslist.org" in link): - cl.scrapeListing(link) + newInfo = cl.scrapeListing(link) + db.update(link, newInfo) elif("https://www.facebook.com/marketplace" in link): fb.scrapeListing(link) else: From b40d0c7c93d98e0f99a106b3639f65c12a628e24 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 13:49:46 -0500 Subject: [PATCH 27/48] extracted click function into utils --- scrapers/craigslist.py | 2 +- scrapers/utils.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scrapers/craigslist.py b/scrapers/craigslist.py index 82a1706..8dc92cc 100644 --- a/scrapers/craigslist.py +++ b/scrapers/craigslist.py @@ -19,7 +19,7 @@ def loadPageResources(driver): scroll += 100 utils.scrollTo(scroll, driver) - driver.execute_script("arguments[0].click();", loadImgButtons[i]) + utils.clickOn(loadImgButtons[i], driver) time.sleep(.5) diff --git a/scrapers/utils.py b/scrapers/utils.py index a62a1bc..add41cf 100644 --- a/scrapers/utils.py +++ b/scrapers/utils.py @@ -9,6 +9,9 @@ def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") +def clickOn(elem, driver): + driver.execute_script("arguments[0].click();", elem) + def setupBrowser(): print("Setting up headless browser") From 8d9b30a8525ed964bf3004ee6a9bf71d20c631b8 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Tue, 31 Oct 2023 13:50:06 -0500 Subject: [PATCH 28/48] Facebook listing scraper incomplete --- scrapers/facebook.py | 39 ++++++++++++++++++++++++++------------- scrapers/scrapers.py | 3 ++- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/scrapers/facebook.py b/scrapers/facebook.py index 5b8c43f..b11f937 100644 --- a/scrapers/facebook.py +++ b/scrapers/facebook.py @@ -9,6 +9,10 @@ priceClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u" metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" +listingInfoClass = "x78zum5 xdt5ytf x1iyjqo2 x1n2onr6" +listingSectionClass = "xod5an3" +bodyClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" + def loadPageResources(driver): scroll = 100 @@ -72,13 +76,13 @@ def processAttributes(attributes): return processedAttributes def scrapeListing(url): - browser = setupBrowser() + browser = utils.setupBrowser() # Navigate to the URL - print(f"Going to {url}") - browser.get(url) + print(f"Going to {url[0:60]}") + browser.get(url[0:60]) - print(f"Loading page for {url}") + print(f"Loading page for {url[0:60]}") time.sleep(1) # Create a BeautifulSoup object from the HTML of the page @@ -86,16 +90,25 @@ def 
scrapeListing(url): soup = BeautifulSoup(html, 'html.parser') try: - description = soup.find('section', id='postingbody').text - attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - - map = soup.find('div', id='map') - longitude = map["data-longitude"] - latitude = map["data-latitude"] + seeMoreButton = browser.find_element("class name", "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace(" ", ".")) + utils.clickOn(seeMoreButton, browser) + + listingInfo = soup.find('div', class_=listingInfoClass) + # description = listingInfo.find('span', class_="x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u") + print(listingInfo) - print([attributes, description, longitude, latitude]) - except: - print(f"Failed scraping {url}") + return 2 + + # attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) + + # map = soup.find('div', id='map') + # longitude = map["data-longitude"] + # latitude = map["data-latitude"] + + # print([attributes, description, longitude, latitude]) + except Exception as error: + print(error) + return -1 # Close the Selenium WebDriver instance browser.quit() \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index a3795e3..e3e1e73 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -23,7 +23,8 @@ def link(link: str): newInfo = cl.scrapeListing(link) db.update(link, newInfo) elif("https://www.facebook.com/marketplace" in link): - fb.scrapeListing(link) + newInfo = fb.scrapeListing(link) + print(newInfo) else: print("Not a Craigslist nor a Facebook Marketplace link") From e91838c39fc3a3792eca388647c8461f46812550 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 9 Nov 2023 13:04:10 -0600 Subject: [PATCH 29/48] added pipfile and pipfile.lock to manage dependencies --- scrapers/Pipfile | 16 +++ scrapers/Pipfile.lock | 281 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 scrapers/Pipfile create mode 100644 scrapers/Pipfile.lock diff --git a/scrapers/Pipfile b/scrapers/Pipfile new file mode 100644 index 0000000..d054d18 --- /dev/null +++ b/scrapers/Pipfile @@ -0,0 +1,16 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +selenium = "*" +bs4 = "*" +pymongo = "*" +typer = "*" +python-dotenv = "*" + +[dev-packages] + +[requires] +python_version = "3.11" diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock new file mode 100644 index 0000000..00b7d4f --- /dev/null +++ b/scrapers/Pipfile.lock @@ -0,0 +1,281 @@ +{ + "_meta": { + "hash": { + "sha256": "11dbba9e7645169d8dd9e6cfe9118716f9e853beec34798dce95b25c651c9695" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.11" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "attrs": { + "hashes": [ + "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04", + "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015" + ], + "markers": "python_version >= '3.7'", + "version": "==23.1.0" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da", + "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a" + ], + "markers": 
"python_full_version >= '3.6.0'", + "version": "==4.12.2" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" + }, + "certifi": { + "hashes": [ + "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", + "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" + ], + "markers": "python_version >= '3.6'", + "version": "==2023.7.22" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "dnspython": { + "hashes": [ + "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8", + "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984" + ], + "markers": "python_version >= '3.8' and python_version < '4.0'", + "version": "==2.4.2" + }, + "h11": { + "hashes": [ + "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", + "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761" + ], + "markers": "python_version >= '3.7'", + "version": "==0.14.0" + }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, + "outcome": { + "hashes": [ + "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", + "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.0.post0" + }, + "pymongo": { + "hashes": [ + "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330", + "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651", + "sha256:05c30fd35cc97f14f354916b45feea535d59060ef867446b5c3c7f9b609dd5dc", + "sha256:0634994b026336195778e5693583c060418d4ab453eff21530422690a97e1ee8", + "sha256:09c7de516b08c57647176b9fc21d929d628e35bcebc7422220c89ae40b62126a", + "sha256:107a234dc55affc5802acb3b6d83cbb8c87355b38a9457fcd8806bdeb8bce161", + "sha256:10a379fb60f1b2406ae57b8899bacfe20567918c8e9d2d545e1b93628fcf2050", + "sha256:128b1485753106c54af481789cdfea12b90a228afca0b11fb3828309a907e10e", + "sha256:1394c4737b325166a65ae7c145af1ebdb9fb153ebedd37cf91d676313e4a67b8", + "sha256:1c63e3a2e8fb815c4b1f738c284a4579897e37c3cfd95fdb199229a1ccfb638a", + "sha256:1e4ed21029d80c4f62605ab16398fe1ce093fff4b5f22d114055e7d9fbc4adb0", + "sha256:1ec71ac633b126c0775ed4604ca8f56c3540f5c21a1220639f299e7a544b55f9", + "sha256:21812453354b151200034750cd30b0140e82ec2a01fd4357390f67714a1bfbde", + "sha256:256c503a75bd71cf7fb9ebf889e7e222d49c6036a48aad5a619f98a0adf0e0d7", + "sha256:2703a9f8f5767986b4f51c259ff452cc837c5a83c8ed5f5361f6e49933743b2f", + "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a", + "sha256:2972dd1f1285866aba027eff2f4a2bbf8aa98563c2ced14cb34ee5602b36afdf", + "sha256:2973f113e079fb98515722cd728e1820282721ec9fd52830e4b73cabdbf1eb28", + "sha256:2ca0ba501898b2ec31e6c3acf90c31910944f01d454ad8e489213a156ccf1bda", + "sha256:2d2be5c9c3488fa8a70f83ed925940f488eac2837a996708d98a0e54a861f212", + "sha256:2f8c04277d879146eacda920476e93d520eff8bec6c022ac108cfa6280d84348", + "sha256:325701ae7b56daa5b0692305b7cb505ca50f80a1288abb32ff420a8a209b01ca", + 
"sha256:3729b8db02063da50eeb3db88a27670d85953afb9a7f14c213ac9e3dca93034b", + "sha256:3919708594b86d0f5cdc713eb6fccd3f9b9532af09ea7a5d843c933825ef56c4", + "sha256:39a1cd5d383b37285641d5a7a86be85274466ae336a61b51117155936529f9b3", + "sha256:3ec6c20385c5a58e16b1ea60c5e4993ea060540671d7d12664f385f2fb32fe79", + "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4", + "sha256:49f2af6cf82509b15093ce3569229e0d53c90ad8ae2eef940652d4cf1f81e045", + "sha256:4a0269811661ba93c472c8a60ea82640e838c2eb148d252720a09b5123f2c2fe", + "sha256:518c90bdd6e842c446d01a766b9136fec5ec6cc94f3b8c3f8b4a332786ee6b64", + "sha256:5717a308a703dda2886a5796a07489c698b442f5e409cf7dc2ac93de8d61d764", + "sha256:5802acc012bbb4bce4dff92973dff76482f30ef35dd4cb8ab5b0e06aa8f08c80", + "sha256:5e63146dbdb1eac207464f6e0cfcdb640c9c5ff0f57b754fa96fe252314a1dc6", + "sha256:6695d7136a435c1305b261a9ddb9b3ecec9863e05aab3935b96038145fd3a977", + "sha256:680fa0fc719e1a3dcb81130858368f51d83667d431924d0bcf249644bce8f303", + "sha256:6b18276f14b4b6d92e707ab6db19b938e112bd2f1dc3f9f1a628df58e4fd3f0d", + "sha256:6bafea6061d63059d8bc2ffc545e2f049221c8a4457d236c5cd6a66678673eab", + "sha256:6d6a1b1361f118e7fefa17ae3114e77f10ee1b228b20d50c47c9f351346180c8", + "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34", + "sha256:79f41576b3022c2fe9780ae3e44202b2438128a25284a8ddfa038f0785d87019", + "sha256:7b0e6361754ac596cd16bfc6ed49f69ffcd9b60b7bc4bcd3ea65c6a83475e4ff", + "sha256:7e3b0127b260d4abae7b62203c4c7ef0874c901b55155692353db19de4b18bc4", + "sha256:7fc2bb8a74dcfcdd32f89528e38dcbf70a3a6594963d60dc9595e3b35b66e414", + "sha256:806e094e9e85d8badc978af8c95b69c556077f11844655cb8cd2d1758769e521", + "sha256:81dd1308bd5630d2bb5980f00aa163b986b133f1e9ed66c66ce2a5bc3572e891", + "sha256:82e620842e12e8cb4050d2643a81c8149361cd82c0a920fa5a15dc4ca8a4000f", + "sha256:85f2cdc400ee87f5952ebf2a117488f2525a3fb2e23863a8efe3e4ee9e54e4d1", + "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8", + "sha256:8adf014f2779992eba3b513e060d06f075f0ab2fb3ad956f413a102312f65cdf", + "sha256:9b0f98481ad5dc4cb430a60bbb8869f05505283b9ae1c62bdb65eb5e020ee8e3", + "sha256:9bea9138b0fc6e2218147e9c6ce1ff76ff8e29dc00bb1b64842bd1ca107aee9f", + "sha256:a09bfb51953930e7e838972ddf646c5d5f984992a66d79da6ba7f6a8d8a890cd", + "sha256:a0be99b599da95b7a90a918dd927b20c434bea5e1c9b3efc6a3c6cd67c23f813", + "sha256:a49aca4d961823b2846b739380c847e8964ff7ae0f0a683992b9d926054f0d6d", + "sha256:a4dc1319d0c162919ee7f4ee6face076becae2abbd351cc14f1fe70af5fb20d9", + "sha256:a8273e1abbcff1d7d29cbbb1ea7e57d38be72f1af3c597c854168508b91516c2", + "sha256:a8f7f9feecae53fa18d6a3ea7c75f9e9a1d4d20e5c3f9ce3fba83f07bcc4eee2", + "sha256:ad4f66fbb893b55f96f03020e67dcab49ffde0177c6565ccf9dec4fdf974eb61", + "sha256:af425f323fce1b07755edd783581e7283557296946212f5b1a934441718e7528", + "sha256:b14dd73f595199f4275bed4fb509277470d9b9059310537e3b3daba12b30c157", + "sha256:b4ad70d7cac4ca0c7b31444a0148bd3af01a2662fa12b1ad6f57cd4a04e21766", + "sha256:b80a4ee19b3442c57c38afa978adca546521a8822d663310b63ae2a7d7b13f3a", + "sha256:ba51129fcc510824b6ca6e2ce1c27e3e4d048b6e35d3ae6f7e517bed1b8b25ce", + "sha256:c011bd5ad03cc096f99ffcfdd18a1817354132c1331bed7a837a25226659845f", + "sha256:cc94f9fea17a5af8cf1a343597711a26b0117c0b812550d99934acb89d526ed2", + "sha256:ccd785fafa1c931deff6a7116e9a0d402d59fabe51644b0d0c268295ff847b25", + "sha256:d16a534da0e39785687b7295e2fcf9a339f4a20689024983d11afaa4657f8507", + "sha256:d3077a31633beef77d057c6523f5de7271ddef7bde5e019285b00c0cc9cac1e3", + 
"sha256:d603edea1ff7408638b2504905c032193b7dcee7af269802dbb35bc8c3310ed5", + "sha256:db082f728160369d9a6ed2e722438291558fc15ce06d0a7d696a8dad735c236b", + "sha256:ddef295aaf80cefb0c1606f1995899efcb17edc6b327eb6589e234e614b87756", + "sha256:e16ade71c93f6814d095d25cd6d28a90d63511ea396bd96e9ffcb886b278baaa", + "sha256:e3db7d833a7c38c317dc95b54e27f1d27012e031b45a7c24e360b53197d5f6e7", + "sha256:e5e193f89f4f8c1fe273f9a6e6df915092c9f2af6db2d1afb8bd53855025c11f", + "sha256:eb438a8bf6b695bf50d57e6a059ff09652a07968b2041178b3744ea785fcef9b", + "sha256:ebf02c32afa6b67e5861a27183dd98ed88419a94a2ab843cc145fb0bafcc5b28", + "sha256:ecd9e1fa97aa11bf67472220285775fa15e896da108f425e55d23d7540a712ce", + "sha256:ef67fedd863ffffd4adfd46d9d992b0f929c7f61a8307366d664d93517f2c78e", + "sha256:f28ae33dc5a0b9cee06e95fd420e42155d83271ab75964baf747ce959cac5f52", + "sha256:fb1c56d891f9e34303c451998ef62ba52659648bb0d75b03c5e4ac223a3342c2", + "sha256:fe03bf25fae4b95d8afe40004a321df644400fdcba4c8e5e1a19c1085b740888" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==4.6.0" + }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "version": "==1.7.1" + }, + "python-dotenv": { + "hashes": [ + "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba", + "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==1.0.0" + }, + "selenium": { + "hashes": [ + "sha256:22eab5a1724c73d51b240a69ca702997b717eee4ba1f6065bf5d6b44dba01d48", + "sha256:9e82cd1ac647fb73cf0d4a6e280284102aaa3c9d94f0fa6e6cc4b5db6a30afbf" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==4.15.2" + }, + "sniffio": { + "hashes": [ + "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101", + "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.0" + }, + "sortedcontainers": { + "hashes": [ + "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", + "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0" + ], + "version": "==2.4.0" + }, + "soupsieve": { + "hashes": [ + "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690", + "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7" + ], + "markers": "python_version >= '3.8'", + "version": "==2.5" + }, + "trio": { + "hashes": [ + "sha256:16f89f7dcc8f7b9dcdec1fcd863e0c039af6d0f9a22f8dfd56f75d75ec73fd48", + "sha256:bb4abb3f4af23f96679e7c8cdabb8b234520f2498550d2cf63ebfd95f2ce27fe" + ], + "markers": "python_version >= '3.8'", + "version": "==0.23.1" + }, + "trio-websocket": { + "hashes": [ + "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f", + "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638" + ], + "markers": "python_version >= '3.7'", + "version": "==0.11.1" + }, + "typer": { + "hashes": [ + "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2", + "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==0.9.0" + }, + "typing-extensions": { + "hashes": [ + 
"sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", + "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" + ], + "markers": "python_version >= '3.8'", + "version": "==4.8.0" + }, + "urllib3": { + "extras": [ + "socks" + ], + "hashes": [ + "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84", + "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.7" + }, + "wsproto": { + "hashes": [ + "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065", + "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==1.2.0" + } + }, + "develop": {} +} From 0e1cd28cec2f4540dab707495507897fb9c068ff Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 9 Nov 2023 13:14:34 -0600 Subject: [PATCH 30/48] organize file structure --- scrapers/scrapers.py | 8 ++++---- scrapers/src/__init__.py | 0 scrapers/{ => src}/craigslist.py | 2 +- scrapers/{ => src}/database.py | 0 scrapers/{ => src}/facebook.py | 2 +- scrapers/{ => src}/utils.py | 9 ++++----- 6 files changed, 10 insertions(+), 11 deletions(-) create mode 100644 scrapers/src/__init__.py rename scrapers/{ => src}/craigslist.py (95%) rename scrapers/{ => src}/database.py (100%) rename scrapers/{ => src}/facebook.py (96%) rename scrapers/{ => src}/utils.py (92%) diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index e3e1e73..78a8b6b 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -1,8 +1,8 @@ import typer -import utils -import craigslist as cl -import facebook as fb -import database as db +from src import utils +from src import craigslist as cl +from src import facebook as fb +from src import database as db app = typer.Typer() diff --git a/scrapers/src/__init__.py b/scrapers/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapers/craigslist.py b/scrapers/src/craigslist.py similarity index 95% rename from scrapers/craigslist.py rename to scrapers/src/craigslist.py index 8dc92cc..d08322c 100644 --- a/scrapers/craigslist.py +++ b/scrapers/src/craigslist.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup import time -import utils +from . import utils def loadPageResources(driver): scroll = 100 diff --git a/scrapers/database.py b/scrapers/src/database.py similarity index 100% rename from scrapers/database.py rename to scrapers/src/database.py diff --git a/scrapers/facebook.py b/scrapers/src/facebook.py similarity index 96% rename from scrapers/facebook.py rename to scrapers/src/facebook.py index b11f937..291c7ec 100644 --- a/scrapers/facebook.py +++ b/scrapers/src/facebook.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup import time -import utils +from . import utils postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" linkClass = "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv" diff --git a/scrapers/utils.py b/scrapers/src/utils.py similarity index 92% rename from scrapers/utils.py rename to scrapers/src/utils.py index add41cf..6e1172a 100644 --- a/scrapers/utils.py +++ b/scrapers/src/utils.py @@ -1,10 +1,9 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options -import utils -import database as db +from . 
import database as db
-
-import craigslist
-import facebook
+from . import craigslist
+from . import facebook
 
 def scrollTo(x, driver):
     driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})")
@@ -28,7 +27,7 @@ def scrape(website, scraperVersion):
         scraper = facebook
 
     cityURLs = scraper.setupURLs(2011)
-    browser = utils.setupBrowser()
+    browser = setupBrowser()
 
     for url in cityURLs:
         print(f"Going to {url}")
From c2694a4e44252949834a9f4c12d975f0d91d88fc Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 13:27:54 -0600
Subject: [PATCH 31/48] Update README with scraper instructions
---
 README.md | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 0c0ecc7..7adf5a1 100644
--- a/README.md
+++ b/README.md
@@ -5,4 +5,27 @@ Senior Design Repository for the Statefarm Automotive Fraud Project
 Make a copy of the ``.env.example`` file and make the following changes.
 1. remove ``.example`` from the extension
 2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
-3. Paste the connection URL provided in MongoDB Atlas. Include the password and username fields using ``${VARIABLE}`` syntax to embed the value of the variable \ No newline at end of file
+3. Paste the connection URL provided in MongoDB Atlas. Include the password and username fields using ``${VARIABLE}`` syntax to embed the value of the variable
+
+## Run Scrapers locally
+**Prerequisites**
+- python3
+- pipenv
+
+**Installing dependencies**
+Navigate to ``scrapers/`` and run
+```bash
+pipenv install
+```
+
+**Scraper Usage**
+```bash
+# Scrape Craigslist homepage
+python3 scrapers.py craigslist
+
+# Scrape Facebook Marketplace homepage
+python3 scrapers.py facebook
+
+# Scrape a specific craigslist or facebook car listing
+python3 scrapers.py link [LINK]
+``` \ No newline at end of file
From 816d18506c1f329c703a1371b3e43acf03caef5b Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 13:30:17 -0600
Subject: [PATCH 32/48] pipenv shell in README
---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 7adf5a1..1092af9 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,11 @@ Make a copy of the ``.env.example`` file and make the following changes. 
 - pipenv
 
 **Installing dependencies**
-Navigate to ``scrapers/`` and run
+Navigate to ``scrapers/`` and open the virtual environment using
+```bash
+pipenv shell
+```
+Then install dependencies using
 ```bash
 pipenv install
 ```
From 667183ddf84ecbe8954c8596193994be9e6586c4 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 14:06:23 -0600
Subject: [PATCH 33/48] added pipfile scripts
---
 scrapers/Pipfile | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/scrapers/Pipfile b/scrapers/Pipfile
index d054d18..e94f5d9 100644
--- a/scrapers/Pipfile
+++ b/scrapers/Pipfile
@@ -3,6 +3,11 @@ url = "https://pypi.org/simple"
 verify_ssl = true
 name = "pypi"
+[scripts]
+craigslist = "python3 scrapers.py craigslist"
+facebook = "python3 scrapers.py facebook"
+link = "python3 scrapers.py link"
+
 [packages]
 selenium = "*"
From f14de608917136a6de791a56b61c6cecc64d18b9 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 14:25:58 -0600
Subject: [PATCH 34/48] update README with pipenv scripts
---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 1092af9..54a27fd 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,11 @@ pipenv install
 **Scraper Usage**
 ```bash
 # Scrape Craigslist homepage
-python3 scrapers.py craigslist
+pipenv run craigslist
 # Scrape Facebook Marketplace homepage
-python3 scrapers.py facebook
+pipenv run facebook
 # Scrape a specific craigslist or facebook car listing
-python3 scrapers.py link [LINK]
+pipenv run link [LINK]
 ``` \ No newline at end of file
From dcd09598e21ccef6163ef9009ae31f05b260068c Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 9 Nov 2023 20:30:53 -0600
Subject: [PATCH 35/48] updated db and collection name
---
 scrapers/src/database.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
index 40285e2..5c47e9e 100644
--- a/scrapers/src/database.py
+++ b/scrapers/src/database.py
@@ -2,6 +2,8 @@ import pymongo
 import os
 from datetime import date
+db = "scrape"
+collection = "scraped_raw"
 def get_conn(db):
     # load environment variable containing db uri (which includes username and password)
@@ -17,7 +19,6 @@ def get_conn(db):
         print("An Invalid URI host error was received. 
Is your Atlas host name correct in your connection string (found the .env)?") return {"success" : False, "db": 0} - # use a database named "Test" return {"success" : True, "db": client.get_database(db)} def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): @@ -49,19 +50,19 @@ def post_raw(scraperVersion, source, title, price, location, miles, link, images for attr in attributes: car[attr["label"]] = attr["value"] - # Insert into collection called "scrape_test" - conn = get_conn("Test") + # Insert into collection called "scrape_raw" + conn = get_conn(db) if (conn["success"]): - result = conn["db"]["raw"].insert_one(car) + result = conn["db"][collection].insert_one(car) return result.acknowledged else: return False def update(link, newFields): - conn = get_conn("Test") + conn = get_conn(db) if (conn["success"]): - result = conn["db"]["raw"].update_one( + result = conn["db"][collection].update_one( {'_id': link}, { '$set': newFields @@ -70,4 +71,3 @@ def update(link, newFields): return result.acknowledged else: return False - From 3e202b04a5aeb044366d0b074c51a73bb4da9b68 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Sun, 12 Nov 2023 18:38:36 -0600 Subject: [PATCH 36/48] finally got docker running locally T-T --- scrapers/Dockerfile | 22 ++++++++++++++++++++++ scrapers/requirements.txt | 23 +++++++++++++++++++++++ scrapers/scrapers.py | 4 ++-- scrapers/src/utils.py | 22 +++++++++++++++++++--- 4 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 scrapers/Dockerfile create mode 100644 scrapers/requirements.txt diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile new file mode 100644 index 0000000..6a2f76a --- /dev/null +++ b/scrapers/Dockerfile @@ -0,0 +1,22 @@ +FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build +RUN yum install -y unzip && \ + curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \ + curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \ + unzip /tmp/chromedriver-linux64.zip -d /opt/ && \ + unzip /tmp/chrome-linux64.zip -d /opt/ + +FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 +RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \ + libXcursor libXdamage libXext libXi libXrandr libXScrnSaver \ + libXtst pango at-spi2-atk libXt xorg-x11-server-Xvfb \ + xorg-x11-xauth dbus-glib dbus-glib-devel -y +COPY --from=build /opt/chrome-linux64 /opt/chrome +COPY --from=build /opt/chromedriver-linux64 /opt/ + +COPY scrapers.py ./ +COPY src ./src +COPY requirements.txt ./ + +RUN pip install -r requirements.txt + +CMD [ "scrapers.craigslist" ] \ No newline at end of file diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt new file mode 100644 index 0000000..defce39 --- /dev/null +++ b/scrapers/requirements.txt @@ -0,0 +1,23 @@ +-i https://pypi.org/simple +attrs==23.1.0; python_version >= '3.7' +beautifulsoup4==4.12.2; python_full_version >= '3.6.0' +bs4==0.0.1 +certifi==2023.7.22; python_version >= '3.6' +click==8.1.7; python_version >= '3.7' +dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0' +h11==0.14.0; python_version >= '3.7' +idna==3.4; python_version >= '3.5' +outcome==1.3.0.post0; python_version >= '3.7' 
+pymongo==4.6.0; python_version >= '3.7' +pysocks==1.7.1 +python-dotenv==1.0.0; python_version >= '3.8' +selenium==4.15.2; python_version >= '3.8' +sniffio==1.3.0; python_version >= '3.7' +sortedcontainers==2.4.0 +soupsieve==2.5; python_version >= '3.8' +trio==0.23.1; python_version >= '3.8' +trio-websocket==0.11.1; python_version >= '3.7' +typer==0.9.0; python_version >= '3.6' +typing-extensions==4.8.0; python_version >= '3.8' +urllib3[socks]==2.0.7; python_version >= '3.7' +wsproto==1.2.0; python_full_version >= '3.7.0' diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index 78a8b6b..284caeb 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -10,11 +10,11 @@ facebookScraperVersion = 1 @app.command() -def craigslist(): +def craigslist(event, context): utils.scrape("craigslist", craigslistScraperVersion) @app.command() -def facebook(): +def facebook(event, context): utils.scrape("facebook", facebookScraperVersion) @app.command() diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py index 6e1172a..6d51bbe 100644 --- a/scrapers/src/utils.py +++ b/scrapers/src/utils.py @@ -11,14 +11,30 @@ def scrollTo(x, driver): def clickOn(elem, driver): driver.execute_script("arguments[0].click();", elem) +def createDriverOptions(): + options = webdriver.ChromeOptions() + options.binary_location = '/opt/chrome/chrome' + + options.add_argument("--headless=new") + options.add_argument("--headless=new") + options.add_argument('--no-sandbox') + options.add_argument("--disable-gpu") + options.add_argument("--window-size=1280x1696") + options.add_argument("--single-process") + options.add_argument("--disable-dev-shm-usage") + options.add_argument("--disable-dev-tools") + options.add_argument("--no-zygote") + + return options + def setupBrowser(): print("Setting up headless browser") - options = Options() - options.add_argument("--headless=new") + service = webdriver.ChromeService("/opt/chromedriver") + options = createDriverOptions() print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options) + return webdriver.Chrome(options=options, service=service) def scrape(website, scraperVersion): if (website == 'craigslist'): From 21e751c16d06e2624bec311b6b5efb540d4e9cab Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Sun, 12 Nov 2023 19:35:45 -0600 Subject: [PATCH 37/48] use environ.get instead of getenv --- .env.example => scrapers/.env.example | 0 scrapers/src/database.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename .env.example => scrapers/.env.example (100%) diff --git a/.env.example b/scrapers/.env.example similarity index 100% rename from .env.example rename to scrapers/.env.example diff --git a/scrapers/src/database.py b/scrapers/src/database.py index 5c47e9e..834c14c 100644 --- a/scrapers/src/database.py +++ b/scrapers/src/database.py @@ -8,7 +8,7 @@ def get_conn(db): # load environment variable containing db uri (which includes username and password) load_dotenv() - db_uri = os.getenv("DB_URI") + db_uri = os.environ.get("DB_URI") # create a mongodb connection try: From 5a5515f5ac45d804d82cda093dc5b05778055e66 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 17 Nov 2023 10:21:32 -0600 Subject: [PATCH 38/48] updated pipenv scripts to work for docker locally --- scrapers/Pipfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scrapers/Pipfile b/scrapers/Pipfile index e94f5d9..4eeccc8 100644 --- a/scrapers/Pipfile +++ b/scrapers/Pipfile @@ -4,9 +4,11 @@ verify_ssl = true name = "pypi" [scripts] -craigslist 
= "python3 scrapers.py craigslist" -facebook = "python3 scrapers.py facebook" -link = "python3 scrapers.py link" +build = "docker build --platform linux/amd64 -t smare ." +cont = "docker run --name smarecontainer -d smare:latest" +exec = "docker exec -it smarecontainer" +craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'" +facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'" [packages] selenium = "*" From 5e86f1c770707076b3e57dd83860958d4a8af4f5 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 17 Nov 2023 10:24:35 -0600 Subject: [PATCH 39/48] update README with new pipenv commands --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 54a27fd..f2928fc 100644 --- a/README.md +++ b/README.md @@ -23,13 +23,19 @@ pipenv install ``` **Scraper Usage** +To create build a Docker Image use +```bash +pipenv run build +``` +to run a docker container "smarecontainer" use +```bash +pipenv run cont +``` +then ```bash # Scrape Craigsist homepage pipenv run craigslist # Scrape Facebook Marketplace homepage pipenv run facebook - -# Scrape a specific carigslist or facebook car listing -pipenv run link [LINK] ``` \ No newline at end of file From 81f4f40142a8baa37ba216563dea9131cbe8c330 Mon Sep 17 00:00:00 2001 From: Waseem Polus <69316929+waseem-polus@users.noreply.github.com> Date: Fri, 17 Nov 2023 10:45:28 -0600 Subject: [PATCH 40/48] Add missing new lines --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f2928fc..1a0d75c 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Make a copy of the ``.env.example`` file and make the following changes. - python3 - pipenv -**Installing dependencies** +**Installing dependencies** Navigate to ``scrapers/`` and open the virtual environment using ```bash pipenv shell @@ -22,7 +22,7 @@ Then install dependencies using pipenv install ``` -**Scraper Usage** +**Scraper Usage** To create build a Docker Image use ```bash pipenv run build @@ -38,4 +38,4 @@ pipenv run craigslist # Scrape Facebook Marketplace homepage pipenv run facebook -``` \ No newline at end of file +``` From fd2cf6f33a792c264a67bf4816dc3c0fe0402d7c Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 30 Nov 2023 00:21:06 -0600 Subject: [PATCH 41/48] use regex to determine if facebook or craigslist link --- scrapers/Pipfile.lock | 184 +++++++++++++++++++------------------- scrapers/requirements.txt | 8 +- scrapers/scrapers.py | 8 +- 3 files changed, 102 insertions(+), 98 deletions(-) diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock index 00b7d4f..689e92b 100644 --- a/scrapers/Pipfile.lock +++ b/scrapers/Pipfile.lock @@ -41,11 +41,11 @@ }, "certifi": { "hashes": [ - "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", - "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" + "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", + "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474" ], "markers": "python_version >= '3.6'", - "version": "==2023.7.22" + "version": "==2023.11.17" }, "click": { "hashes": [ @@ -73,11 +73,11 @@ }, "idna": { "hashes": [ - "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", - "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca", + 
"sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f" ], "markers": "python_version >= '3.5'", - "version": "==3.4" + "version": "==3.6" }, "outcome": { "hashes": [ @@ -89,91 +89,91 @@ }, "pymongo": { "hashes": [ - "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330", - "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651", - "sha256:05c30fd35cc97f14f354916b45feea535d59060ef867446b5c3c7f9b609dd5dc", - "sha256:0634994b026336195778e5693583c060418d4ab453eff21530422690a97e1ee8", - "sha256:09c7de516b08c57647176b9fc21d929d628e35bcebc7422220c89ae40b62126a", - "sha256:107a234dc55affc5802acb3b6d83cbb8c87355b38a9457fcd8806bdeb8bce161", - "sha256:10a379fb60f1b2406ae57b8899bacfe20567918c8e9d2d545e1b93628fcf2050", - "sha256:128b1485753106c54af481789cdfea12b90a228afca0b11fb3828309a907e10e", - "sha256:1394c4737b325166a65ae7c145af1ebdb9fb153ebedd37cf91d676313e4a67b8", - "sha256:1c63e3a2e8fb815c4b1f738c284a4579897e37c3cfd95fdb199229a1ccfb638a", - "sha256:1e4ed21029d80c4f62605ab16398fe1ce093fff4b5f22d114055e7d9fbc4adb0", - "sha256:1ec71ac633b126c0775ed4604ca8f56c3540f5c21a1220639f299e7a544b55f9", - "sha256:21812453354b151200034750cd30b0140e82ec2a01fd4357390f67714a1bfbde", - "sha256:256c503a75bd71cf7fb9ebf889e7e222d49c6036a48aad5a619f98a0adf0e0d7", - "sha256:2703a9f8f5767986b4f51c259ff452cc837c5a83c8ed5f5361f6e49933743b2f", - "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a", - "sha256:2972dd1f1285866aba027eff2f4a2bbf8aa98563c2ced14cb34ee5602b36afdf", - "sha256:2973f113e079fb98515722cd728e1820282721ec9fd52830e4b73cabdbf1eb28", - "sha256:2ca0ba501898b2ec31e6c3acf90c31910944f01d454ad8e489213a156ccf1bda", - "sha256:2d2be5c9c3488fa8a70f83ed925940f488eac2837a996708d98a0e54a861f212", - "sha256:2f8c04277d879146eacda920476e93d520eff8bec6c022ac108cfa6280d84348", - "sha256:325701ae7b56daa5b0692305b7cb505ca50f80a1288abb32ff420a8a209b01ca", - "sha256:3729b8db02063da50eeb3db88a27670d85953afb9a7f14c213ac9e3dca93034b", - "sha256:3919708594b86d0f5cdc713eb6fccd3f9b9532af09ea7a5d843c933825ef56c4", - "sha256:39a1cd5d383b37285641d5a7a86be85274466ae336a61b51117155936529f9b3", - "sha256:3ec6c20385c5a58e16b1ea60c5e4993ea060540671d7d12664f385f2fb32fe79", - "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4", - "sha256:49f2af6cf82509b15093ce3569229e0d53c90ad8ae2eef940652d4cf1f81e045", - "sha256:4a0269811661ba93c472c8a60ea82640e838c2eb148d252720a09b5123f2c2fe", - "sha256:518c90bdd6e842c446d01a766b9136fec5ec6cc94f3b8c3f8b4a332786ee6b64", - "sha256:5717a308a703dda2886a5796a07489c698b442f5e409cf7dc2ac93de8d61d764", - "sha256:5802acc012bbb4bce4dff92973dff76482f30ef35dd4cb8ab5b0e06aa8f08c80", - "sha256:5e63146dbdb1eac207464f6e0cfcdb640c9c5ff0f57b754fa96fe252314a1dc6", - "sha256:6695d7136a435c1305b261a9ddb9b3ecec9863e05aab3935b96038145fd3a977", - "sha256:680fa0fc719e1a3dcb81130858368f51d83667d431924d0bcf249644bce8f303", - "sha256:6b18276f14b4b6d92e707ab6db19b938e112bd2f1dc3f9f1a628df58e4fd3f0d", - "sha256:6bafea6061d63059d8bc2ffc545e2f049221c8a4457d236c5cd6a66678673eab", - "sha256:6d6a1b1361f118e7fefa17ae3114e77f10ee1b228b20d50c47c9f351346180c8", - "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34", - "sha256:79f41576b3022c2fe9780ae3e44202b2438128a25284a8ddfa038f0785d87019", - "sha256:7b0e6361754ac596cd16bfc6ed49f69ffcd9b60b7bc4bcd3ea65c6a83475e4ff", - "sha256:7e3b0127b260d4abae7b62203c4c7ef0874c901b55155692353db19de4b18bc4", - "sha256:7fc2bb8a74dcfcdd32f89528e38dcbf70a3a6594963d60dc9595e3b35b66e414", - 
"sha256:806e094e9e85d8badc978af8c95b69c556077f11844655cb8cd2d1758769e521", - "sha256:81dd1308bd5630d2bb5980f00aa163b986b133f1e9ed66c66ce2a5bc3572e891", - "sha256:82e620842e12e8cb4050d2643a81c8149361cd82c0a920fa5a15dc4ca8a4000f", - "sha256:85f2cdc400ee87f5952ebf2a117488f2525a3fb2e23863a8efe3e4ee9e54e4d1", - "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8", - "sha256:8adf014f2779992eba3b513e060d06f075f0ab2fb3ad956f413a102312f65cdf", - "sha256:9b0f98481ad5dc4cb430a60bbb8869f05505283b9ae1c62bdb65eb5e020ee8e3", - "sha256:9bea9138b0fc6e2218147e9c6ce1ff76ff8e29dc00bb1b64842bd1ca107aee9f", - "sha256:a09bfb51953930e7e838972ddf646c5d5f984992a66d79da6ba7f6a8d8a890cd", - "sha256:a0be99b599da95b7a90a918dd927b20c434bea5e1c9b3efc6a3c6cd67c23f813", - "sha256:a49aca4d961823b2846b739380c847e8964ff7ae0f0a683992b9d926054f0d6d", - "sha256:a4dc1319d0c162919ee7f4ee6face076becae2abbd351cc14f1fe70af5fb20d9", - "sha256:a8273e1abbcff1d7d29cbbb1ea7e57d38be72f1af3c597c854168508b91516c2", - "sha256:a8f7f9feecae53fa18d6a3ea7c75f9e9a1d4d20e5c3f9ce3fba83f07bcc4eee2", - "sha256:ad4f66fbb893b55f96f03020e67dcab49ffde0177c6565ccf9dec4fdf974eb61", - "sha256:af425f323fce1b07755edd783581e7283557296946212f5b1a934441718e7528", - "sha256:b14dd73f595199f4275bed4fb509277470d9b9059310537e3b3daba12b30c157", - "sha256:b4ad70d7cac4ca0c7b31444a0148bd3af01a2662fa12b1ad6f57cd4a04e21766", - "sha256:b80a4ee19b3442c57c38afa978adca546521a8822d663310b63ae2a7d7b13f3a", - "sha256:ba51129fcc510824b6ca6e2ce1c27e3e4d048b6e35d3ae6f7e517bed1b8b25ce", - "sha256:c011bd5ad03cc096f99ffcfdd18a1817354132c1331bed7a837a25226659845f", - "sha256:cc94f9fea17a5af8cf1a343597711a26b0117c0b812550d99934acb89d526ed2", - "sha256:ccd785fafa1c931deff6a7116e9a0d402d59fabe51644b0d0c268295ff847b25", - "sha256:d16a534da0e39785687b7295e2fcf9a339f4a20689024983d11afaa4657f8507", - "sha256:d3077a31633beef77d057c6523f5de7271ddef7bde5e019285b00c0cc9cac1e3", - "sha256:d603edea1ff7408638b2504905c032193b7dcee7af269802dbb35bc8c3310ed5", - "sha256:db082f728160369d9a6ed2e722438291558fc15ce06d0a7d696a8dad735c236b", - "sha256:ddef295aaf80cefb0c1606f1995899efcb17edc6b327eb6589e234e614b87756", - "sha256:e16ade71c93f6814d095d25cd6d28a90d63511ea396bd96e9ffcb886b278baaa", - "sha256:e3db7d833a7c38c317dc95b54e27f1d27012e031b45a7c24e360b53197d5f6e7", - "sha256:e5e193f89f4f8c1fe273f9a6e6df915092c9f2af6db2d1afb8bd53855025c11f", - "sha256:eb438a8bf6b695bf50d57e6a059ff09652a07968b2041178b3744ea785fcef9b", - "sha256:ebf02c32afa6b67e5861a27183dd98ed88419a94a2ab843cc145fb0bafcc5b28", - "sha256:ecd9e1fa97aa11bf67472220285775fa15e896da108f425e55d23d7540a712ce", - "sha256:ef67fedd863ffffd4adfd46d9d992b0f929c7f61a8307366d664d93517f2c78e", - "sha256:f28ae33dc5a0b9cee06e95fd420e42155d83271ab75964baf747ce959cac5f52", - "sha256:fb1c56d891f9e34303c451998ef62ba52659648bb0d75b03c5e4ac223a3342c2", - "sha256:fe03bf25fae4b95d8afe40004a321df644400fdcba4c8e5e1a19c1085b740888" + "sha256:00c199e1c593e2c8b033136d7a08f0c376452bac8a896c923fcd6f419e07bdd2", + "sha256:010bc9aa90fd06e5cc52c8fac2c2fd4ef1b5f990d9638548dde178005770a5e8", + "sha256:026a24a36394dc8930cbcb1d19d5eb35205ef3c838a7e619e04bd170713972e7", + "sha256:061598cbc6abe2f382ab64c9caa83faa2f4c51256f732cdd890bcc6e63bfb67e", + "sha256:13552ca505366df74e3e2f0a4f27c363928f3dff0eef9f281eb81af7f29bc3c5", + "sha256:13d613c866f9f07d51180f9a7da54ef491d130f169e999c27e7633abe8619ec9", + "sha256:144a31391a39a390efce0c5ebcaf4bf112114af4384c90163f402cec5ede476b", + "sha256:1461199b07903fc1424709efafe379205bf5f738144b1a50a08b0396357b5abf", + 
"sha256:154b361dcb358ad377d5d40df41ee35f1cc14c8691b50511547c12404f89b5cb", + "sha256:1c5654bb8bb2bdb10e7a0bc3c193dd8b49a960b9eebc4381ff5a2043f4c3c441", + "sha256:1de3c6faf948f3edd4e738abdb4b76572b4f4fdfc1fed4dad02427e70c5a6219", + "sha256:1ed23b0e2dac6f84f44c8494fbceefe6eb5c35db5c1099f56ab78fc0d94ab3af", + "sha256:1f2b856518bfcfa316c8dae3d7b412aecacf2e8ba30b149f5eb3b63128d703b9", + "sha256:2346450a075625c4d6166b40a013b605a38b6b6168ce2232b192a37fb200d588", + "sha256:262356ea5fcb13d35fb2ab6009d3927bafb9504ef02339338634fffd8a9f1ae4", + "sha256:27b81ecf18031998ad7db53b960d1347f8f29e8b7cb5ea7b4394726468e4295e", + "sha256:2940aa20e9cc328e8ddeacea8b9a6f5ddafe0b087fedad928912e787c65b4909", + "sha256:2d4ccac3053b84a09251da8f5350bb684cbbf8c8c01eda6b5418417d0a8ab198", + "sha256:2dd2f6960ee3c9360bed7fb3c678be0ca2d00f877068556785ec2eb6b73d2414", + "sha256:3071ec998cc3d7b4944377e5f1217c2c44b811fae16f9a495c7a1ce9b42fb038", + "sha256:3094c7d2f820eecabadae76bfec02669567bbdd1730eabce10a5764778564f7b", + "sha256:30b2c9caf3e55c2e323565d1f3b7e7881ab87db16997dc0cbca7c52885ed2347", + "sha256:3177f783ae7e08aaf7b2802e0df4e4b13903520e8380915e6337cdc7a6ff01d8", + "sha256:31dab1f3e1d0cdd57e8df01b645f52d43cc1b653ed3afd535d2891f4fc4f9712", + "sha256:33bb16a07d3cc4e0aea37b242097cd5f7a156312012455c2fa8ca396953b11c4", + "sha256:349093675a2d3759e4fb42b596afffa2b2518c890492563d7905fac503b20daa", + "sha256:39d77d8bbb392fa443831e6d4ae534237b1f4eee6aa186f0cdb4e334ba89536e", + "sha256:3a7f02a58a0c2912734105e05dedbee4f7507e6f1bd132ebad520be0b11d46fd", + "sha256:3b287e814a01deddb59b88549c1e0c87cefacd798d4afc0c8bd6042d1c3d48aa", + "sha256:3c74f4725485f0a7a3862cfd374cc1b740cebe4c133e0c1425984bcdcce0f4bb", + "sha256:3cadf7f4c8e94d8a77874b54a63c80af01f4d48c4b669c8b6867f86a07ba994f", + "sha256:3d18a9b9b858ee140c15c5bfcb3e66e47e2a70a03272c2e72adda2482f76a6ad", + "sha256:3f0e6a6c807fa887a0c51cc24fe7ea51bb9e496fe88f00d7930063372c3664c3", + "sha256:4344c30025210b9fa80ec257b0e0aab5aa1d5cca91daa70d82ab97b482cc038e", + "sha256:4497d49d785482cc1a44a0ddf8830b036a468c088e72a05217f5b60a9e025012", + "sha256:547dc5d7f834b1deefda51aedb11a7af9c51c45e689e44e14aa85d44147c7657", + "sha256:5556e306713e2522e460287615d26c0af0fe5ed9d4f431dad35c6624c5d277e9", + "sha256:55dac73316e7e8c2616ba2e6f62b750918e9e0ae0b2053699d66ca27a7790105", + "sha256:56816e43c92c2fa8c11dc2a686f0ca248bea7902f4a067fa6cbc77853b0f041e", + "sha256:5bd94c503271e79917b27c6e77f7c5474da6930b3fb9e70a12e68c2dff386b9a", + "sha256:5ec31adc2e988fd7db3ab509954791bbc5a452a03c85e45b804b4bfc31fa221d", + "sha256:69247f7a2835fc0984bbf0892e6022e9a36aec70e187fcfe6cae6a373eb8c4de", + "sha256:6a0ae7a48a6ef82ceb98a366948874834b86c84e288dbd55600c1abfc3ac1d88", + "sha256:6a1810c2cbde714decf40f811d1edc0dae45506eb37298fd9d4247b8801509fe", + "sha256:76013fef1c9cd1cd00d55efde516c154aa169f2bf059b197c263a255ba8a9ddf", + "sha256:77e0df59b1a4994ad30c6d746992ae887f9756a43fc25dec2db515d94cf0222d", + "sha256:7bb0e9049e81def6829d09558ad12d16d0454c26cabe6efc3658e544460688d9", + "sha256:88beb444fb438385e53dc9110852910ec2a22f0eab7dd489e827038fdc19ed8d", + "sha256:8b47ebd89e69fbf33d1c2df79759d7162fc80c7652dacfec136dae1c9b3afac7", + "sha256:8d219b4508f71d762368caec1fc180960569766049bbc4d38174f05e8ef2fe5b", + "sha256:8ec75f35f62571a43e31e7bd11749d974c1b5cd5ea4a8388725d579263c0fdf6", + "sha256:9167e735379ec43d8eafa3fd675bfbb12e2c0464f98960586e9447d2cf2c7a83", + "sha256:9a710c184ba845afb05a6f876edac8f27783ba70e52d5eaf939f121fc13b2f59", + "sha256:9aafd036f6f2e5ad109aec92f8dbfcbe76cff16bad683eb6dd18013739c0b3ae", + 
"sha256:9c79d597fb3a7c93d7c26924db7497eba06d58f88f58e586aa69b2ad89fee0f8", + "sha256:a2831e05ce0a4df10c4ac5399ef50b9a621f90894c2a4d2945dc5658765514ed", + "sha256:a5e641f931c5cd95b376fd3c59db52770e17bec2bf86ef16cc83b3906c054845", + "sha256:b10d8cda9fc2fcdcfa4a000aa10413a2bf8b575852cd07cb8a595ed09689ca98", + "sha256:b435b13bb8e36be11b75f7384a34eefe487fe87a6267172964628e2b14ecf0a7", + "sha256:b7b1a83ce514700276a46af3d9e481ec381f05b64939effc9065afe18456a6b9", + "sha256:b8729dbf25eb32ad0dc0b9bd5e6a0d0b7e5c2dc8ec06ad171088e1896b522a74", + "sha256:bbed8cccebe1169d45cedf00461b2842652d476d2897fd1c42cf41b635d88746", + "sha256:c258dbacfff1224f13576147df16ce3c02024a0d792fd0323ac01bed5d3c545d", + "sha256:c30a9e06041fbd7a7590693ec5e407aa8737ad91912a1e70176aff92e5c99d20", + "sha256:c91ea3915425bd4111cb1b74511cdc56d1d16a683a48bf2a5a96b6a6c0f297f7", + "sha256:d0355cff58a4ed6d5e5f6b9c3693f52de0784aa0c17119394e2a8e376ce489d4", + "sha256:d483793a384c550c2d12cb794ede294d303b42beff75f3b3081f57196660edaf", + "sha256:d4c2be9760b112b1caf649b4977b81b69893d75aa86caf4f0f398447be871f3c", + "sha256:d8e62d06e90f60ea2a3d463ae51401475568b995bafaffd81767d208d84d7bb1", + "sha256:da08ea09eefa6b960c2dd9a68ec47949235485c623621eb1d6c02b46765322ac", + "sha256:dd1fa413f8b9ba30140de198e4f408ffbba6396864c7554e0867aa7363eb58b2", + "sha256:e2aced6fb2f5261b47d267cb40060b73b6527e64afe54f6497844c9affed5fd0", + "sha256:e438417ce1dc5b758742e12661d800482200b042d03512a8f31f6aaa9137ad40", + "sha256:e470fa4bace5f50076c32f4b3cc182b31303b4fefb9b87f990144515d572820b", + "sha256:eaf2f65190c506def2581219572b9c70b8250615dc918b3b7c218361a51ec42e", + "sha256:ef102a67ede70e1721fe27f75073b5314911dbb9bc27cde0a1c402a11531e7bd", + "sha256:ef801027629c5b511cf2ba13b9be29bfee36ae834b2d95d9877818479cdc99ea", + "sha256:f7acc03a4f1154ba2643edeb13658d08598fe6e490c3dd96a241b94f09801626", + "sha256:f9756f1d25454ba6a3c2f1ef8b7ddec23e5cdeae3dc3c3377243ae37a383db00", + "sha256:ff62ba8ff70f01ab4fe0ae36b2cb0b5d1f42e73dfc81ddf0758cd9f77331ad25", + "sha256:ff925f1cca42e933376d09ddc254598f8c5fcd36efc5cac0118bb36c36217c41" ], "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==4.6.0" + "version": "==4.6.1" }, "pysocks": { "hashes": [ @@ -262,11 +262,11 @@ "socks" ], "hashes": [ - "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84", - "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e" + "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3", + "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54" ], - "markers": "python_version >= '3.7'", - "version": "==2.0.7" + "markers": "python_version >= '3.8'", + "version": "==2.1.0" }, "wsproto": { "hashes": [ diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt index defce39..e0430c4 100644 --- a/scrapers/requirements.txt +++ b/scrapers/requirements.txt @@ -2,13 +2,13 @@ attrs==23.1.0; python_version >= '3.7' beautifulsoup4==4.12.2; python_full_version >= '3.6.0' bs4==0.0.1 -certifi==2023.7.22; python_version >= '3.6' +certifi==2023.11.17; python_version >= '3.6' click==8.1.7; python_version >= '3.7' dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0' h11==0.14.0; python_version >= '3.7' -idna==3.4; python_version >= '3.5' +idna==3.6; python_version >= '3.5' outcome==1.3.0.post0; python_version >= '3.7' -pymongo==4.6.0; python_version >= '3.7' +pymongo==4.6.1; python_version >= '3.7' pysocks==1.7.1 python-dotenv==1.0.0; python_version >= '3.8' selenium==4.15.2; python_version >= '3.8' @@ -19,5 +19,5 
@@ trio==0.23.1; python_version >= '3.8'
 trio-websocket==0.11.1; python_version >= '3.7'
 typer==0.9.0; python_version >= '3.6'
 typing-extensions==4.8.0; python_version >= '3.8'
-urllib3[socks]==2.0.7; python_version >= '3.7'
+urllib3[socks]==2.1.0; python_version >= '3.8'
 wsproto==1.2.0; python_full_version >= '3.7.0'
diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index 284caeb..36f61e0 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -1,4 +1,5 @@
 import typer
+import re
 from src import utils
 from src import craigslist as cl
 from src import facebook as fb
@@ -19,10 +20,13 @@ def facebook(event, context):
 @app.command()
 def link(link: str):
-    if (".craigslist.org" in link):
+    clPattern = re.compile(r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$")
+    fbPattern = re.compile(r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$")
+
+    if (clPattern.match(link)):
         newInfo = cl.scrapeListing(link)
         db.update(link, newInfo)
-    elif("https://www.facebook.com/marketplace" in link):
+    elif(fbPattern.match(link)):
         newInfo = fb.scrapeListing(link)
         print(newInfo)
     else:

From cd2af8c1f04a6ff674cf808cf4af05324979654d Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 30 Nov 2023 00:41:28 -0600
Subject: [PATCH 42/48] added new lines to fix linting errors

---
 scrapers/scrapers.py | 2 +-
 scrapers/src/craigslist.py | 2 +-
 scrapers/src/database.py | 1 +
 scrapers/src/facebook.py | 2 +-
 scrapers/src/utils.py | 3 +--
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index 36f61e0..deb437e 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -33,4 +33,4 @@ def link(link: str):
         print("Not a Craigslist nor a Facebook Marketplace link")
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py
index d08322c..64ee544 100644
--- a/scrapers/src/craigslist.py
+++ b/scrapers/src/craigslist.py
@@ -101,4 +101,4 @@ def scrapeListing(url):
         print(f"Failed scraping {url}")
 
     # Close the Selenium WebDriver instance
-    browser.quit()
\ No newline at end of file
+    browser.quit()
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
index 834c14c..5cbd3a2 100644
--- a/scrapers/src/database.py
+++ b/scrapers/src/database.py
@@ -2,6 +2,7 @@
 import pymongo
 import os
 from datetime import date
+
 db = "scrape"
 collection = "scraped_raw"
diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py
index 291c7ec..c40694e 100644
--- a/scrapers/src/facebook.py
+++ b/scrapers/src/facebook.py
@@ -111,4 +111,4 @@ def scrapeListing(url):
         return -1
 
     # Close the Selenium WebDriver instance
-    browser.quit()
\ No newline at end of file
+    browser.quit()
diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py
index 6d51bbe..8651c74 100644
--- a/scrapers/src/utils.py
+++ b/scrapers/src/utils.py
@@ -1,7 +1,6 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from . import database as db
-
 from . import craigslist
 from . import facebook
@@ -65,4 +64,4 @@
         except Exception as error:
             print(error)
 
-    browser.quit()
\ No newline at end of file
+    browser.quit()

From 6ea8e12c05c8db68494f5c39eae3d3f8d41b09e5 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Thu, 30 Nov 2023 01:05:21 -0600
Subject: [PATCH 43/48] fixed import order and spacing for isort

---
 scrapers/scrapers.py | 8 +++++---
 scrapers/src/craigslist.py | 5 ++++-
 scrapers/src/database.py | 5 +++--
 scrapers/src/facebook.py | 4 +++-
 scrapers/src/utils.py | 4 +++-
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index deb437e..c4f4996 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -1,9 +1,11 @@
-import typer
 import re
-from src import utils
+
+import typer
+
 from src import craigslist as cl
-from src import facebook as fb
 from src import database as db
+from src import facebook as fb
+from src import utils
 
 app = typer.Typer()
diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py
index 64ee544..d4b3358 100644
--- a/scrapers/src/craigslist.py
+++ b/scrapers/src/craigslist.py
@@ -1,7 +1,10 @@
-from bs4 import BeautifulSoup
 import time
+
+from bs4 import BeautifulSoup
+
 from . import utils
 
+
 def loadPageResources(driver):
     scroll = 100
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
index 5cbd3a2..8143c2a 100644
--- a/scrapers/src/database.py
+++ b/scrapers/src/database.py
@@ -1,8 +1,9 @@
-from dotenv import load_dotenv
-import pymongo
 import os
 from datetime import date
 
+import pymongo
+from dotenv import load_dotenv
+
 db = "scrape"
 collection = "scraped_raw"
diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py
index c40694e..31752d8 100644
--- a/scrapers/src/facebook.py
+++ b/scrapers/src/facebook.py
@@ -1,5 +1,7 @@
-from bs4 import BeautifulSoup
 import time
+
+from bs4 import BeautifulSoup
+
 from . import utils
 
 postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24"
diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py
index 8651c74..17dbaef 100644
--- a/scrapers/src/utils.py
+++ b/scrapers/src/utils.py
@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
-from . import database as db
+
 from . import craigslist
+from . import database as db
 from .
import facebook + def scrollTo(x, driver): driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") From a8ab8b70fadcc5e0ce8fc591fc7c79bb4aeb982d Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 30 Nov 2023 01:39:47 -0600 Subject: [PATCH 44/48] fixed flake8 errors with black linter --- scrapers/.flake8 | 2 + scrapers/scrapers.py | 34 ++++--- scrapers/src/craigslist.py | 189 +++++++++++++++++++++-------------- scrapers/src/database.py | 140 ++++++++++++++------------ scrapers/src/facebook.py | 198 ++++++++++++++++++++++--------------- scrapers/src/utils.py | 114 ++++++++++++--------- 6 files changed, 399 insertions(+), 278 deletions(-) create mode 100644 scrapers/.flake8 diff --git a/scrapers/.flake8 b/scrapers/.flake8 new file mode 100644 index 0000000..79a16af --- /dev/null +++ b/scrapers/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 \ No newline at end of file diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py index c4f4996..5d93a24 100644 --- a/scrapers/scrapers.py +++ b/scrapers/scrapers.py @@ -12,27 +12,35 @@ craigslistScraperVersion = 1 facebookScraperVersion = 1 + @app.command() def craigslist(event, context): - utils.scrape("craigslist", craigslistScraperVersion) + utils.scrape("craigslist", craigslistScraperVersion) + @app.command() def facebook(event, context): - utils.scrape("facebook", facebookScraperVersion) + utils.scrape("facebook", facebookScraperVersion) + @app.command() def link(link: str): - clPattern = re.compile(r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$") - fbPattern = re.compile(r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$") - - if (clPattern.match(link)): - newInfo = cl.scrapeListing(link) - db.update(link, newInfo) - elif(fbPattern.match(link)): - newInfo = fb.scrapeListing(link) - print(newInfo) - else: - print("Not a Craigslist nor a Facebook Marketplace link") + clPattern = re.compile( + r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$" + ) + fbPattern = re.compile( + r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$" + ) + + if clPattern.match(link): + newInfo = cl.scrapeListing(link) + db.update(link, newInfo) + elif fbPattern.match(link): + newInfo = fb.scrapeListing(link) + print(newInfo) + else: + print("Not a Craigslist nor a Facebook Marketplace link") + if __name__ == "__main__": app() diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py index d4b3358..13e30b9 100644 --- a/scrapers/src/craigslist.py +++ b/scrapers/src/craigslist.py @@ -6,102 +6,143 @@ def loadPageResources(driver): - scroll = 100 + scroll = 100 - print("Waiting to load...") - time.sleep(2) + print("Waiting to load...") + time.sleep(2) - utils.scrollTo(scroll, driver) + utils.scrollTo(scroll, driver) - loadImgButtons = driver.find_elements("class name", "slider-back-arrow") + loadImgButtons = driver.find_elements("class name", "slider-back-arrow") - time.sleep(2) + time.sleep(2) - # Emulate a user scrolling - for i in range(len(loadImgButtons)): - scroll += 100 - utils.scrollTo(scroll, driver) + # Emulate a user scrolling + for i in range(len(loadImgButtons)): + scroll += 100 + utils.scrollTo(scroll, driver) - utils.clickOn(loadImgButtons[i], driver) + utils.clickOn(loadImgButtons[i], driver) - time.sleep(.5) + time.sleep(0.5) def setupURLs(oldestAllowedCars): - # List of TX cities to scrape; can be expanded - cities = ["abilene", "amarillo", "austin", "beaumont", "brownsville", "collegestation", "corpuschristi", "dallas", "nacogdoches", 
"delrio", "elpaso", "galveston", "houston", "killeen", "laredo", "lubbock", "mcallen", "odessa", "sanangelo", "sanantonio", "sanmarcos", "bigbend", "texoma", "easttexas", "victoriatx", "waco", "wichitafalls"] + # List of TX cities to scrape; can be expanded + cities = [ + "abilene", + "amarillo", + "austin", + "beaumont", + "brownsville", + "collegestation", + "corpuschristi", + "dallas", + "nacogdoches", + "delrio", + "elpaso", + "galveston", + "houston", + "killeen", + "laredo", + "lubbock", + "mcallen", + "odessa", + "sanangelo", + "sanantonio", + "sanmarcos", + "bigbend", + "texoma", + "easttexas", + "victoriatx", + "waco", + "wichitafalls", + ] + + # Set the URL of the Facebook Marketplace automotive category + base_url = ( + "https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0" + ) + return [base_url.format(city, oldestAllowedCars) for city in cities] - # Set the URL of the Facebook Marketplace automotive category - base_url = 'https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0' - return [base_url.format(city, oldestAllowedCars) for city in cities] def getAllPosts(browser): - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + # Find all of the car listings on the page + return soup.find_all("div", class_="gallery-card") - # Find all of the car listings on the page - return soup.find_all('div', class_='gallery-card') def getCarInfo(post): - title = post.find('span', class_='label').text + title = post.find("span", class_="label").text + + print(f'Scraping "{title}"') + + price = post.find("span", class_="priceinfo").text + metadata = post.find("div", class_="meta").text.split("·") - print(f'Scraping "{title}"') + odometer = metadata[1] + if len(metadata) >= 3: + location = metadata[2] - price = post.find('span', class_='priceinfo').text - metadata = post.find('div', class_="meta").text.split('·') + link = post.find("a", class_="posting-title", href=True)["href"] - odometer = metadata[1] - if (len(metadata) >= 3): - location = metadata[2] - - link = post.find('a', class_='posting-title', href=True)["href"] - - imageElements = post.findAll('img') - images = [img["src"] for img in imageElements] + imageElements = post.findAll("img") + images = [img["src"] for img in imageElements] + + return title, price, location, odometer, link, images - return title, price, location, odometer, link, images def processAttributes(attributes): - processedAttributes = [] - - for attr in attributes: - [label, value] = attr.split(": ") - processedAttributes.append({"label": label.replace(" ", "-").lower(), "value": value}) + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append( + {"label": label.replace(" ", "-").lower(), "value": value} + ) + + return processedAttributes - return processedAttributes def scrapeListing(url): - browser = utils.setupBrowser() - - # Navigate to the URL - print(f"Going to {url}") - browser.get(url) - - print(f"Loading page for {url}") - time.sleep(1) - - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') - - try: - description = soup.find('section', id='postingbody').text - attributes = processAttributes([attr.text for attr in soup.findAll('p', 
class_="attrgroup")[1].findAll('span')]) - map = soup.find('div', id='map') - - car = { - "postBody": description, - "longitude": map["data-longitude"], - "latitude": map["data-latitude"] - } - - for attr in attributes: - car[attr["label"]] = attr["value"] - - return car - except: - print(f"Failed scraping {url}") - - # Close the Selenium WebDriver instance - browser.quit() + browser = utils.setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + try: + description = soup.find("section", id="postingbody").text + attributes = processAttributes( + [ + attr.text + for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span") + ] + ) + map = soup.find("div", id="map") + + car = { + "postBody": description, + "longitude": map["data-longitude"], + "latitude": map["data-latitude"], + } + + for attr in attributes: + car[attr["label"]] = attr["value"] + + return car + except Exception as e: + print(f"Failed scraping {url}: \n{e}") + + # Close the Selenium WebDriver instance + browser.quit() diff --git a/scrapers/src/database.py b/scrapers/src/database.py index 8143c2a..1c88567 100644 --- a/scrapers/src/database.py +++ b/scrapers/src/database.py @@ -7,69 +7,83 @@ db = "scrape" collection = "scraped_raw" + def get_conn(db): - # load environment variable containing db uri (which includes username and password) - load_dotenv() - db_uri = os.environ.get("DB_URI") - - # create a mongodb connection - try: - client = pymongo.MongoClient(db_uri) - - # return a friendly error if a URI error is thrown - except pymongo.errors.ConfigurationError: - print("An Invalid URI host error was received. Is your Atlas host name correct in your connection string (found the .env)?") - return {"success" : False, "db": 0} - - return {"success" : True, "db": client.get_database(db)} - -def post_raw(scraperVersion, source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): - car = { - "_id": link, - "source": source, - "scraper-version": scraperVersion, - "scrape-date": str(date.today()), - "title": title, - "price": price, - "location": location, - "odometer": miles, - "link": link - } - - if (images is not None): - car["images"] = images - - if (postBody is not None): - car["postBody"] = postBody - - if (longitude is not None): - car["longitude"] = longitude - - if (latitude is not None): - car["latitude"] = latitude - - if (attributes is not None): - for attr in attributes: - car[attr["label"]] = attr["value"] - - # Insert into collection called "scrape_raw" - conn = get_conn(db) - - if (conn["success"]): - result = conn["db"][collection].insert_one(car) - return result.acknowledged - else: - return False + # load environment variable containing db uri (which includes username and password) + load_dotenv() + db_uri = os.environ.get("DB_URI") + + # create a mongodb connection + try: + client = pymongo.MongoClient(db_uri) + + # return a friendly error if a URI error is thrown + except pymongo.errors.ConfigurationError: + print( + "An Invalid URI host error was received." + " Is your Atlas host name correct in your connection string (found the .env)?" 
+ ) + return {"success": False, "db": 0} + + return {"success": True, "db": client.get_database(db)} + + +def post_raw( + scraperVersion, + source, + title, + price, + location, + miles, + link, + images=None, + postBody=None, + longitude=None, + latitude=None, + attributes=None, +): + car = { + "_id": link, + "source": source, + "scraper-version": scraperVersion, + "scrape-date": str(date.today()), + "title": title, + "price": price, + "location": location, + "odometer": miles, + "link": link, + } + + if images is not None: + car["images"] = images + + if postBody is not None: + car["postBody"] = postBody + + if longitude is not None: + car["longitude"] = longitude + + if latitude is not None: + car["latitude"] = latitude + + if attributes is not None: + for attr in attributes: + car[attr["label"]] = attr["value"] + + # Insert into collection called "scrape_raw" + conn = get_conn(db) + + if conn["success"]: + result = conn["db"][collection].insert_one(car) + return result.acknowledged + else: + return False + def update(link, newFields): - conn = get_conn(db) - if (conn["success"]): - result = conn["db"][collection].update_one( - {'_id': link}, - { - '$set': newFields - } - ) - return result.acknowledged - else: - return False + conn = get_conn(db) + if conn["success"]: + result = conn["db"][collection].update_one({"_id": link}, {"$set": newFields}) + return result.acknowledged + else: + return False diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py index 31752d8..61aef28 100644 --- a/scrapers/src/facebook.py +++ b/scrapers/src/facebook.py @@ -4,113 +4,153 @@ from . import utils -postClass = "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" -linkClass = "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv" +postClass = ( + "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4" + " x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" +) +linkClass = ( + "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619" + "x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r" + " xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq" + " x1a2a7pz x1heor9g x1lku1pv" +) thumbnailClass = "xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3" titleClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6" -priceClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u" +priceClass = ( + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel" + " x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i" + " x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7" + " x1s688f xzsf02u" +) metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" listingInfoClass = "x78zum5 xdt5ytf x1iyjqo2 x1n2onr6" listingSectionClass = "xod5an3" -bodyClass = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" +bodyClass = ( + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx" + " x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty" + " x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" +) + def loadPageResources(driver): - scroll = 100 + scroll = 100 - print("Waiting to load...") - time.sleep(2) - utils.scrollTo(scroll, driver) - time.sleep(1.5) + print("Waiting 
to load...") + time.sleep(2) + utils.scrollTo(scroll, driver) + time.sleep(1.5) - # Emulate a user scrolling - for i in range(10): - scroll += 1000 - utils.scrollTo(scroll, driver) - time.sleep(1) + # Emulate a user scrolling + for i in range(10): + scroll += 1000 + utils.scrollTo(scroll, driver) + time.sleep(1) def setupURLs(oldestAllowedCars): - # List of TX cities to scrape; can be expanded - cities = ['houston', 'dallas', 'austin', 'fortworth', 'elpaso', 'sanantonio'] + # List of TX cities to scrape; can be expanded + cities = ["houston", "dallas", "austin", "fortworth", "elpaso", "sanantonio"] + + # Set the URL of the Facebook Marketplace automotive category + base_url = "https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false" + return [base_url.format(city, oldestAllowedCars) for city in cities] - # Set the URL of the Facebook Marketplace automotive category - base_url = 'https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false' - return [base_url.format(city, oldestAllowedCars) for city in cities] def getAllPosts(browser): - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + # Find all of the car listings on the page + return soup.find_all("div", class_=postClass) - # Find all of the car listings on the page - return soup.find_all('div', class_=postClass) def getCarInfo(post): - title = post.find('span', class_=titleClass).text + title = post.find("span", class_=titleClass).text + + print(f'Scraping "{title}"') - print(f'Scraping "{title}"') + price = post.find("span", class_=priceClass).text + metadata = post.findAll("span", class_=metaClass) - price = post.find('span', class_=priceClass).text - metadata = post.findAll('span', class_=metaClass) + location = metadata[0].text + odometer = metadata[1].text - location = metadata[0].text - odometer = metadata[1].text + link = post.find("a", class_=linkClass, href=True)["href"] + link = "https://facebook.com" + link - link = post.find('a', class_=linkClass, href=True)["href"] - link = "https://facebook.com" + link - - thumbnail = post.find('img', class_=thumbnailClass)["src"] + thumbnail = post.find("img", class_=thumbnailClass)["src"] + + return title, price, location, odometer, link, [thumbnail] - return title, price, location, odometer, link, [thumbnail] def getCarImages(): - # class="x1a0syf3 x1ja2u2z" - return "TODO" + # class="x1a0syf3 x1ja2u2z" + return "TODO" + def processAttributes(attributes): - processedAttributes = [] - - for attr in attributes: - [label, value] = attr.split(": ") - processedAttributes.append({"label": label, "value": value}) + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes - return processedAttributes def scrapeListing(url): - browser = utils.setupBrowser() - - # Navigate to the URL - print(f"Going to {url[0:60]}") - browser.get(url[0:60]) - - print(f"Loading page for {url[0:60]}") - time.sleep(1) - - # Create a BeautifulSoup object from the HTML of the page - html = browser.page_source - soup = BeautifulSoup(html, 'html.parser') - - try: - seeMoreButton = browser.find_element("class name", "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace(" ", ".")) - utils.clickOn(seeMoreButton, 
browser) - - listingInfo = soup.find('div', class_=listingInfoClass) - # description = listingInfo.find('span', class_="x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u") - print(listingInfo) - - return 2 - - # attributes = processAttributes([attr.text for attr in soup.findAll('p', class_="attrgroup")[1].findAll('span')]) - - # map = soup.find('div', id='map') - # longitude = map["data-longitude"] - # latitude = map["data-latitude"] - - # print([attributes, description, longitude, latitude]) - except Exception as error: - print(error) - return -1 - - # Close the Selenium WebDriver instance - browser.quit() + browser = utils.setupBrowser() + + # Navigate to the URL + print(f"Going to {url[0:60]}") + browser.get(url[0:60]) + + print(f"Loading page for {url[0:60]}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + try: + seeMoreButton = browser.find_element( + "class name", + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace( + " ", "." + ), + ) + utils.clickOn(seeMoreButton, browser) + + listingInfo = soup.find("div", class_=listingInfoClass) + # description = listingInfo.find( + # "span", + # class_=( + # "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq" + # " x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m" + # " x6prxxf xvq8zen xo1l8bm xzsf02u" + # ), + # ) + print(listingInfo) + + return 2 + + # attributes = processAttributes( + # [ + # attr.text + # for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span") + # ] + # ) + + # map = soup.find('div', id='map') + # longitude = map["data-longitude"] + # latitude = map["data-latitude"] + + # print([attributes, description, longitude, latitude]) + except Exception as error: + print(error) + return -1 + + # Close the Selenium WebDriver instance + browser.quit() diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py index 17dbaef..399e64e 100644 --- a/scrapers/src/utils.py +++ b/scrapers/src/utils.py @@ -1,5 +1,4 @@ from selenium import webdriver -from selenium.webdriver.chrome.options import Options from . import craigslist from . 
import database as db @@ -7,63 +6,80 @@ def scrollTo(x, driver): - driver.execute_script(f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})") + driver.execute_script( + f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})" + ) + def clickOn(elem, driver): - driver.execute_script("arguments[0].click();", elem) + driver.execute_script("arguments[0].click();", elem) + def createDriverOptions(): - options = webdriver.ChromeOptions() - options.binary_location = '/opt/chrome/chrome' + options = webdriver.ChromeOptions() + options.binary_location = "/opt/chrome/chrome" - options.add_argument("--headless=new") - options.add_argument("--headless=new") - options.add_argument('--no-sandbox') - options.add_argument("--disable-gpu") - options.add_argument("--window-size=1280x1696") - options.add_argument("--single-process") - options.add_argument("--disable-dev-shm-usage") - options.add_argument("--disable-dev-tools") - options.add_argument("--no-zygote") + options.add_argument("--headless=new") + options.add_argument("--headless=new") + options.add_argument("--no-sandbox") + options.add_argument("--disable-gpu") + options.add_argument("--window-size=1280x1696") + options.add_argument("--single-process") + options.add_argument("--disable-dev-shm-usage") + options.add_argument("--disable-dev-tools") + options.add_argument("--no-zygote") + + return options - return options def setupBrowser(): - print("Setting up headless browser") + print("Setting up headless browser") + + service = webdriver.ChromeService("/opt/chromedriver") + options = createDriverOptions() - service = webdriver.ChromeService("/opt/chromedriver") - options = createDriverOptions() + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options, service=service) - print("Creating a new Selenium WebDriver instance") - return webdriver.Chrome(options=options, service=service) def scrape(website, scraperVersion): - if (website == 'craigslist'): - scraper = craigslist - elif (website == 'facebook'): - scraper = facebook - - cityURLs = scraper.setupURLs(2011) - browser = setupBrowser() - - for url in cityURLs: - print(f"Going to {url}") - browser.get(url) - - print(f"Loading cars from {url}") - scraper.loadPageResources(browser) - - carPosts = scraper.getAllPosts(browser) - - for post in carPosts: - try: - title, price, location, odometer, link, images = scraper.getCarInfo(post) - success = db.post_raw(scraperVersion, website, title, price, location, odometer, link, images) - if (success): - print("posted to db") - else: - print("failed to post to db") - except Exception as error: - print(error) - - browser.quit() + if website == "craigslist": + scraper = craigslist + elif website == "facebook": + scraper = facebook + + cityURLs = scraper.setupURLs(2011) + browser = setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + scraper.loadPageResources(browser) + + carPosts = scraper.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = scraper.getCarInfo( + post + ) + success = db.post_raw( + scraperVersion, + website, + title, + price, + location, + odometer, + link, + images, + ) + if success: + print("posted to db") + else: + print("failed to post to db") + except Exception as error: + print(error) + + browser.quit() From eda33a90368da468b4469530bd764c1acb7f893f Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Thu, 30 Nov 2023 01:52:53 -0600 Subject: [PATCH 45/48] added 
flake8, black, and isort to dev dependencies --- scrapers/Dockerfile | 3 +- scrapers/Pipfile | 3 ++ scrapers/Pipfile.lock | 112 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 115 insertions(+), 3 deletions(-) diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile index 6a2f76a..39359a3 100644 --- a/scrapers/Dockerfile +++ b/scrapers/Dockerfile @@ -3,7 +3,8 @@ RUN yum install -y unzip && \ curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \ curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \ unzip /tmp/chromedriver-linux64.zip -d /opt/ && \ - unzip /tmp/chrome-linux64.zip -d /opt/ + unzip /tmp/chrome-linux64.zip -d /opt/ && \ + yup clean all FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \ diff --git a/scrapers/Pipfile b/scrapers/Pipfile index 4eeccc8..0ecb354 100644 --- a/scrapers/Pipfile +++ b/scrapers/Pipfile @@ -18,6 +18,9 @@ typer = "*" python-dotenv = "*" [dev-packages] +isort = "*" +black = "*" +flake8 = "*" [requires] python_version = "3.11" diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock index 689e92b..bb2797e 100644 --- a/scrapers/Pipfile.lock +++ b/scrapers/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "11dbba9e7645169d8dd9e6cfe9118716f9e853beec34798dce95b25c651c9695" + "sha256": "716098b2b29f4b98c932bd4554e3953a184fea6603a7d3f17e7bd47179932031" }, "pipfile-spec": 6, "requires": { @@ -277,5 +277,113 @@ "version": "==1.2.0" } }, - "develop": {} + "develop": { + "black": { + "hashes": [ + "sha256:250d7e60f323fcfc8ea6c800d5eba12f7967400eb6c2d21ae85ad31c204fb1f4", + "sha256:2a9acad1451632021ee0d146c8765782a0c3846e0e0ea46659d7c4f89d9b212b", + "sha256:412f56bab20ac85927f3a959230331de5614aecda1ede14b373083f62ec24e6f", + "sha256:421f3e44aa67138ab1b9bfbc22ee3780b22fa5b291e4db8ab7eee95200726b07", + "sha256:45aa1d4675964946e53ab81aeec7a37613c1cb71647b5394779e6efb79d6d187", + "sha256:4c44b7211a3a0570cc097e81135faa5f261264f4dfaa22bd5ee2875a4e773bd6", + "sha256:4c68855825ff432d197229846f971bc4d6666ce90492e5b02013bcaca4d9ab05", + "sha256:5133f5507007ba08d8b7b263c7aa0f931af5ba88a29beacc4b2dc23fcefe9c06", + "sha256:54caaa703227c6e0c87b76326d0862184729a69b73d3b7305b6288e1d830067e", + "sha256:58e5f4d08a205b11800332920e285bd25e1a75c54953e05502052738fe16b3b5", + "sha256:698c1e0d5c43354ec5d6f4d914d0d553a9ada56c85415700b81dc90125aac244", + "sha256:6c1cac07e64433f646a9a838cdc00c9768b3c362805afc3fce341af0e6a9ae9f", + "sha256:760415ccc20f9e8747084169110ef75d545f3b0932ee21368f63ac0fee86b221", + "sha256:7f622b6822f02bfaf2a5cd31fdb7cd86fcf33dab6ced5185c35f5db98260b055", + "sha256:cf57719e581cfd48c4efe28543fea3d139c6b6f1238b3f0102a9c73992cbb479", + "sha256:d136ef5b418c81660ad847efe0e55c58c8208b77a57a28a503a5f345ccf01394", + "sha256:dbea0bb8575c6b6303cc65017b46351dc5953eea5c0a59d7b7e3a2d2f433a911", + "sha256:fc7f6a44d52747e65a02558e1d807c82df1d66ffa80a601862040a43ec2e3142" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==23.11.0" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "flake8": { + "hashes": [ + 
"sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23", + "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.1'", + "version": "==6.1.0" + }, + "isort": { + "hashes": [ + "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504", + "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.0'", + "version": "==5.12.0" + }, + "mccabe": { + "hashes": [ + "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", + "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e" + ], + "markers": "python_version >= '3.6'", + "version": "==0.7.0" + }, + "mypy-extensions": { + "hashes": [ + "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", + "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.0" + }, + "packaging": { + "hashes": [ + "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", + "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7" + ], + "markers": "python_version >= '3.7'", + "version": "==23.2" + }, + "pathspec": { + "hashes": [ + "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20", + "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3" + ], + "markers": "python_version >= '3.7'", + "version": "==0.11.2" + }, + "platformdirs": { + "hashes": [ + "sha256:118c954d7e949b35437270383a3f2531e99dd93cf7ce4dc8340d3356d30f173b", + "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731" + ], + "markers": "python_version >= '3.7'", + "version": "==4.0.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f", + "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67" + ], + "markers": "python_version >= '3.8'", + "version": "==2.11.1" + }, + "pyflakes": { + "hashes": [ + "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774", + "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc" + ], + "markers": "python_version >= '3.8'", + "version": "==3.1.0" + } + } } From dc860ece70c8b90a92b7e8a5bea27a78b0d27359 Mon Sep 17 00:00:00 2001 From: waseem-polus Date: Fri, 1 Dec 2023 08:12:07 -0600 Subject: [PATCH 46/48] fixed hadolint errors in dockerfile --- scrapers/.hadolint.yaml | 2 ++ scrapers/Dockerfile | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 scrapers/.hadolint.yaml diff --git a/scrapers/.hadolint.yaml b/scrapers/.hadolint.yaml new file mode 100644 index 0000000..faf3736 --- /dev/null +++ b/scrapers/.hadolint.yaml @@ -0,0 +1,2 @@ +ignored: + - DL3033 \ No newline at end of file diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile index 39359a3..217ca23 100644 --- a/scrapers/Dockerfile +++ b/scrapers/Dockerfile @@ -4,20 +4,22 @@ RUN yum install -y unzip && \ curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \ unzip /tmp/chromedriver-linux64.zip -d /opt/ && \ unzip /tmp/chrome-linux64.zip -d /opt/ && \ - yup clean all + yum clean all FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \ libXcursor libXdamage libXext 
libXi libXrandr libXScrnSaver \
     libXtst pango at-spi2-atk libXt xorg-x11-server-Xvfb \
     xorg-x11-xauth dbus-glib dbus-glib-devel -y && \
+    yum clean all
 COPY --from=build /opt/chrome-linux64 /opt/chrome
 COPY --from=build /opt/chromedriver-linux64 /opt/
+WORKDIR /var/task
 COPY scrapers.py ./
 COPY src ./src
 COPY requirements.txt ./
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 CMD [ "scrapers.craigslist" ]
\ No newline at end of file

From d5a6170d0dd9d7932a9385b164e7151706d911dd Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Fri, 1 Dec 2023 08:22:34 -0600
Subject: [PATCH 47/48] Added latest versions of yum packages to dockerfile

---
 scrapers/.hadolint.yaml | 2 --
 scrapers/Dockerfile | 10 +++++-----
 2 files changed, 5 insertions(+), 7 deletions(-)
 delete mode 100644 scrapers/.hadolint.yaml

diff --git a/scrapers/.hadolint.yaml b/scrapers/.hadolint.yaml
deleted file mode 100644
index faf3736..0000000
--- a/scrapers/.hadolint.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-ignored:
-  - DL3033
\ No newline at end of file
diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile
index 217ca23..844d302 100644
--- a/scrapers/Dockerfile
+++ b/scrapers/Dockerfile
@@ -1,5 +1,5 @@
 FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
-RUN yum install -y unzip && \
+RUN yum install -y unzip-* && \
     curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
     curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
@@ -7,10 +7,10 @@ RUN yum install -y unzip && \
     yum clean all
 
 FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
-RUN yum install atk cups-libs gtk3 libXcomposite alsa-lib \
-    libXcursor libXdamage libXext libXi libXrandr libXScrnSaver \
-    libXtst pango at-spi2-atk libXt xorg-x11-server-Xvfb \
-    xorg-x11-xauth dbus-glib dbus-glib-devel -y && \
+RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
+    libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
+    libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
+    xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
     yum clean all
 COPY --from=build /opt/chrome-linux64 /opt/chrome
 COPY --from=build /opt/chromedriver-linux64 /opt/

From 3a9e8c871a928313b966f2d8957a22ead8737965 Mon Sep 17 00:00:00 2001
From: waseem-polus
Date: Fri, 1 Dec 2023 08:30:16 -0600
Subject: [PATCH 48/48] isort plz T-T

---
 scrapers/scrapers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
index 5d93a24..90cd562 100644
--- a/scrapers/scrapers.py
+++ b/scrapers/scrapers.py
@@ -1,7 +1,6 @@
 import re
 
 import typer
-
 from src import craigslist as cl
 from src import database as db
 from src import facebook as fb