Commit 9c28455

Merge pull request #56 from lryanle/6/craigslist-scraper

6/craigslist scraper

waseem-polus authored Dec 1, 2023
2 parents a7e8f4c + 3a9e8c8
Showing 17 changed files with 1,025 additions and 255 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -130,4 +130,7 @@ dist
.pnp.*

# misc
*.DS_STORE

# python
*.pyc
33 changes: 33 additions & 0 deletions README.md
@@ -16,3 +16,36 @@ Make a copy of the ``.env.example`` file and make the following changes.
2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)

3. Paste the connection URL provided in MongoDB Atlas. Reference the username and password using ``${VARIABLE}`` syntax so their values are embedded in the URL
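
For reference, a hypothetical ``.env`` following those steps might look like this (the key names are illustrative, not necessarily the ones in ``.env.example``):
```bash
# Illustrative keys only -- copy the real names from .env.example
MONGO_USERNAME=your-atlas-username
MONGO_PASSWORD=your-atlas-password
MONGO_URL=mongodb+srv://${MONGO_USERNAME}:${MONGO_PASSWORD}@cluster0.example.mongodb.net/
```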

## Run Scrapers locally
**Prerequisites**
- python3
- pipenv

**Installing dependencies**
Navigate to ``scrapers/`` and activate the virtual environment using
```bash
pipenv shell
```
Then install dependencies using
```bash
pipenv install
```

**Scraper Usage**
To build a Docker image, use
```bash
pipenv run build
```
To run a Docker container named "smarecontainer", use
```bash
pipenv run cont
```
Then run a scraper inside it with
```bash
# Scrape Craigslist homepage
pipenv run craigslist

# Scrape Facebook Marketplace homepage
pipenv run facebook
```
File renamed without changes.
2 changes: 2 additions & 0 deletions scrapers/.flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120
25 changes: 25 additions & 0 deletions scrapers/Dockerfile
@@ -0,0 +1,25 @@
# Stage 1: download Chrome for Testing and the matching chromedriver
FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
RUN yum install -y unzip-* && \
    curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
    curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
    unzip /tmp/chromedriver-linux64.zip -d /opt/ && \
    unzip /tmp/chrome-linux64.zip -d /opt/ && \
    yum clean all

# Stage 2: Lambda runtime image with the shared libraries Chrome needs
FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
    libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
    libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
    xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
    yum clean all
COPY --from=build /opt/chrome-linux64 /opt/chrome
COPY --from=build /opt/chromedriver-linux64 /opt/

WORKDIR /var/task
COPY scrapers.py ./
COPY src ./src
COPY requirements.txt ./

RUN pip install --no-cache-dir -r requirements.txt

# Lambda handler: module.function, resolved by the base image's entrypoint
CMD [ "scrapers.craigslist" ]
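
Because the image is built on AWS's Lambda Python base image, the handler can also be exercised locally through the Runtime Interface Emulator that base image ships with. A minimal sketch, assuming the container port is published (the ``cont`` script in the Pipfile below starts the container without mapping a port):
```bash
# Sketch: invoke the containerized handler via the Lambda Runtime Interface Emulator
docker run --rm -d -p 9000:8080 smare:latest
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{}'
```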
26 changes: 26 additions & 0 deletions scrapers/Pipfile
@@ -0,0 +1,26 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[scripts]
build = "docker build --platform linux/amd64 -t smare ."
cont = "docker run --name smarecontainer -d smare:latest"
exec = "docker exec -it smarecontainer"
craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'"
facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'"

[packages]
selenium = "*"
bs4 = "*"
pymongo = "*"
typer = "*"
python-dotenv = "*"

[dev-packages]
isort = "*"
black = "*"
flake8 = "*"

[requires]
python_version = "3.11"
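
Note how the ``[scripts]`` compose: ``craigslist`` and ``facebook`` each call ``exec``, which runs Python inside the already-running ``smarecontainer``, so a typical session chains them:
```bash
pipenv run build       # build the "smare" image
pipenv run cont        # start the detached "smarecontainer"
pipenv run craigslist  # exec into the container and run scrapers.craigslist
```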
389 changes: 389 additions & 0 deletions scrapers/Pipfile.lock

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions scrapers/requirements.txt
@@ -0,0 +1,23 @@
-i https://pypi.org/simple
attrs==23.1.0; python_version >= '3.7'
beautifulsoup4==4.12.2; python_full_version >= '3.6.0'
bs4==0.0.1
certifi==2023.11.17; python_version >= '3.6'
click==8.1.7; python_version >= '3.7'
dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0'
h11==0.14.0; python_version >= '3.7'
idna==3.6; python_version >= '3.5'
outcome==1.3.0.post0; python_version >= '3.7'
pymongo==4.6.1; python_version >= '3.7'
pysocks==1.7.1
python-dotenv==1.0.0; python_version >= '3.8'
selenium==4.15.2; python_version >= '3.8'
sniffio==1.3.0; python_version >= '3.7'
sortedcontainers==2.4.0
soupsieve==2.5; python_version >= '3.8'
trio==0.23.1; python_version >= '3.8'
trio-websocket==0.11.1; python_version >= '3.7'
typer==0.9.0; python_version >= '3.6'
typing-extensions==4.8.0; python_version >= '3.8'
urllib3[socks]==2.1.0; python_version >= '3.8'
wsproto==1.2.0; python_full_version >= '3.7.0'
45 changes: 45 additions & 0 deletions scrapers/scrapers.py
@@ -0,0 +1,45 @@
import re

import typer
from src import craigslist as cl
from src import database as db
from src import facebook as fb
from src import utils

app = typer.Typer()

craigslistScraperVersion = 1
facebookScraperVersion = 1


@app.command()
def craigslist(event, context):
utils.scrape("craigslist", craigslistScraperVersion)


@app.command()
def facebook(event, context):
utils.scrape("facebook", facebookScraperVersion)


@app.command()
def link(link: str):
clPattern = re.compile(
r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$"
)
fbPattern = re.compile(
r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$"
)

if clPattern.match(link):
newInfo = cl.scrapeListing(link)
db.update(link, newInfo)
elif fbPattern.match(link):
newInfo = fb.scrapeListing(link)
print(newInfo)
else:
print("Not a Craigslist nor a Facebook Marketplace link")


if __name__ == "__main__":
app()
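
Because this file builds a Typer app, the commands can also be invoked straight from a shell; a sketch of the ``link`` command (the listing URL here is illustrative):
```bash
# From scrapers/, inside the pipenv shell
python3 scrapers.py link "https://dallas.craigslist.org/dal/cto/d/example-car/0000000000.html"
```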
Empty file added scrapers/src/__init__.py
148 changes: 148 additions & 0 deletions scrapers/src/craigslist.py
@@ -0,0 +1,148 @@
import time

from bs4 import BeautifulSoup

from . import utils


def loadPageResources(driver):
scroll = 100

print("Waiting to load...")
time.sleep(2)

utils.scrollTo(scroll, driver)

loadImgButtons = driver.find_elements("class name", "slider-back-arrow")

time.sleep(2)

# Emulate a user scrolling
for i in range(len(loadImgButtons)):
scroll += 100
utils.scrollTo(scroll, driver)

utils.clickOn(loadImgButtons[i], driver)

time.sleep(0.5)


def setupURLs(oldestAllowedCars):
# List of TX cities to scrape; can be expanded
cities = [
"abilene",
"amarillo",
"austin",
"beaumont",
"brownsville",
"collegestation",
"corpuschristi",
"dallas",
"nacogdoches",
"delrio",
"elpaso",
"galveston",
"houston",
"killeen",
"laredo",
"lubbock",
"mcallen",
"odessa",
"sanangelo",
"sanantonio",
"sanmarcos",
"bigbend",
"texoma",
"easttexas",
"victoriatx",
"waco",
"wichitafalls",
]

    # Craigslist cars & trucks (cta) search URL, filtered by minimum model year
base_url = (
"https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0"
)
return [base_url.format(city, oldestAllowedCars) for city in cities]
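
For example, ``setupURLs(2011)`` yields one search URL per city; the first is:
```python
>>> setupURLs(2011)[0]
'https://abilene.craigslist.org/search/cta?min_auto_year=2011#search=1~gallery~0~0'
```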


def getAllPosts(browser):
# Create a BeautifulSoup object from the HTML of the page
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")

# Find all of the car listings on the page
return soup.find_all("div", class_="gallery-card")


def getCarInfo(post):
title = post.find("span", class_="label").text

print(f'Scraping "{title}"')

price = post.find("span", class_="priceinfo").text
metadata = post.find("div", class_="meta").text.split("·")

    odometer = metadata[1]
    location = None  # not every listing includes a location
    if len(metadata) >= 3:
        location = metadata[2]

link = post.find("a", class_="posting-title", href=True)["href"]

imageElements = post.findAll("img")
images = [img["src"] for img in imageElements]

return title, price, location, odometer, link, images


def processAttributes(attributes):
processedAttributes = []

for attr in attributes:
        label, value = attr.split(": ", 1)  # split once; values may contain ": "
processedAttributes.append(
{"label": label.replace(" ", "-").lower(), "value": value}
)

return processedAttributes
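
A worked example of this normalization (the inputs mimic Craigslist ``attrgroup`` strings):
```python
>>> processAttributes(["fuel: gas", "title status: clean"])
[{'label': 'fuel', 'value': 'gas'}, {'label': 'title-status', 'value': 'clean'}]
```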


def scrapeListing(url):
browser = utils.setupBrowser()

# Navigate to the URL
print(f"Going to {url}")
browser.get(url)

print(f"Loading page for {url}")
time.sleep(1)

# Create a BeautifulSoup object from the HTML of the page
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")

    try:
        description = soup.find("section", id="postingbody").text
        attributes = processAttributes(
            [
                attr.text
                for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span")
            ]
        )
        mapElement = soup.find("div", id="map")  # avoid shadowing the map() builtin

        car = {
            "postBody": description,
            "longitude": mapElement["data-longitude"],
            "latitude": mapElement["data-latitude"],
        }

        for attr in attributes:
            car[attr["label"]] = attr["value"]

        return car
    except Exception as e:
        print(f"Failed scraping {url}: \n{e}")
    finally:
        # Close the Selenium WebDriver instance whether or not scraping succeeded
        browser.quit()
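
``scrapers.py`` and ``craigslist.py`` import ``setupBrowser``, ``scrollTo``, and ``clickOn`` from ``scrapers/src/utils.py``, which this diff does not show. A minimal, hypothetical sketch of those Selenium helpers, assuming the Chrome and chromedriver paths the Dockerfile installs (the ``scrape`` and database helpers are omitted since their behavior isn't visible here):
```python
# Hypothetical sketch of the helpers in scrapers/src/utils.py --
# the real file is not part of this diff.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def setupBrowser():
    # Headless Chrome pointed at the binaries the Dockerfile unpacks into /opt
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.binary_location = "/opt/chrome/chrome"
    return webdriver.Chrome(service=Service("/opt/chromedriver"), options=options)


def scrollTo(y, driver):
    # Jump the viewport to a fixed vertical offset, emulating a user scroll
    driver.execute_script(f"window.scrollTo(0, {y});")


def clickOn(element, driver):
    # Click through JavaScript so overlays don't intercept the event
    driver.execute_script("arguments[0].click();", element)
```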

1 comment on commit 9c28455

@vercel bot commented on 9c28455 Dec 1, 2023

Successfully deployed to the following URLs:

seniordesign – ./

seniordesign-lryanle.vercel.app
smare.lryanle.com
seniordesign-git-main-lryanle.vercel.app
smare.vercel.app
