Commit 9c28455

Merge pull request #56 from lryanle/6/craigslist-scraper

6/craigslist scraper

waseem-polus authored Dec 1, 2023
2 parents a7e8f4c + 3a9e8c8
Showing 17 changed files with 1,025 additions and 255 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -130,4 +130,7 @@ dist
.pnp.*

# misc
*.DS_STORE

# python
*.pyc
33 changes: 33 additions & 0 deletions README.md
@@ -16,3 +16,36 @@ Make a copy of the ``.env.example`` file and make the following changes.
2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)

3. Paste the connection URL provided in MongoDB Atlas. Reference the username and password using ``${VARIABLE}`` syntax so their values are embedded in the URL
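
For reference, a hypothetical ``.env`` following those steps might look like this (the key names are illustrative, not necessarily the ones in ``.env.example``):
```bash
# Illustrative keys only -- copy the real names from .env.example
MONGO_USERNAME=your-atlas-username
MONGO_PASSWORD=your-atlas-password
MONGO_URL=mongodb+srv://${MONGO_USERNAME}:${MONGO_PASSWORD}@cluster0.example.mongodb.net/
```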

## Run Scrapers locally
**Prerequisites**
- python3
- pipenv

**Installing dependencies**
Navigate to ``scrapers/`` and activate the virtual environment using
```bash
pipenv shell
```
Then install dependencies using
```bash
pipenv install
```

**Scraper Usage**
To build a Docker image, use
```bash
pipenv run build
```
To run a Docker container named "smarecontainer", use
```bash
pipenv run cont
```
Then run a scraper inside it with
```bash
# Scrape Craigslist homepage
pipenv run craigslist

# Scrape Facebook Marketplace homepage
pipenv run facebook
```
File renamed without changes.
2 changes: 2 additions & 0 deletions scrapers/.flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120
25 changes: 25 additions & 0 deletions scrapers/Dockerfile
@@ -0,0 +1,25 @@
# Stage 1: download Chrome for Testing and the matching chromedriver
FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
RUN yum install -y unzip-* && \
    curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
    curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
    unzip /tmp/chromedriver-linux64.zip -d /opt/ && \
    unzip /tmp/chrome-linux64.zip -d /opt/ && \
    yum clean all

# Stage 2: Lambda runtime image with the shared libraries Chrome needs
FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
    libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
    libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
    xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
    yum clean all
COPY --from=build /opt/chrome-linux64 /opt/chrome
COPY --from=build /opt/chromedriver-linux64 /opt/

WORKDIR /var/task
COPY scrapers.py ./
COPY src ./src
COPY requirements.txt ./

RUN pip install --no-cache-dir -r requirements.txt

# Lambda handler: module.function, resolved by the base image's entrypoint
CMD [ "scrapers.craigslist" ]
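
Because the image is built on AWS's Lambda Python base image, the handler can also be exercised locally through the Runtime Interface Emulator that base image ships with. A minimal sketch, assuming the container port is published (the ``cont`` script in the Pipfile below starts the container without mapping a port):
```bash
# Sketch: invoke the containerized handler via the Lambda Runtime Interface Emulator
docker run --rm -d -p 9000:8080 smare:latest
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{}'
```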
26 changes: 26 additions & 0 deletions scrapers/Pipfile
@@ -0,0 +1,26 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[scripts]
build = "docker build --platform linux/amd64 -t smare ."
cont = "docker run --name smarecontainer -d smare:latest"
exec = "docker exec -it smarecontainer"
craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'"
facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'"

[packages]
selenium = "*"
bs4 = "*"
pymongo = "*"
typer = "*"
python-dotenv = "*"

[dev-packages]
isort = "*"
black = "*"
flake8 = "*"

[requires]
python_version = "3.11"
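
Note how the ``[scripts]`` compose: ``craigslist`` and ``facebook`` each call ``exec``, which runs Python inside the already-running ``smarecontainer``, so a typical session chains them:
```bash
pipenv run build       # build the "smare" image
pipenv run cont        # start the detached "smarecontainer"
pipenv run craigslist  # exec into the container and run scrapers.craigslist
```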
389 changes: 389 additions & 0 deletions scrapers/Pipfile.lock

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions scrapers/requirements.txt
@@ -0,0 +1,23 @@
-i https://pypi.org/simple
attrs==23.1.0; python_version >= '3.7'
beautifulsoup4==4.12.2; python_full_version >= '3.6.0'
bs4==0.0.1
certifi==2023.11.17; python_version >= '3.6'
click==8.1.7; python_version >= '3.7'
dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0'
h11==0.14.0; python_version >= '3.7'
idna==3.6; python_version >= '3.5'
outcome==1.3.0.post0; python_version >= '3.7'
pymongo==4.6.1; python_version >= '3.7'
pysocks==1.7.1
python-dotenv==1.0.0; python_version >= '3.8'
selenium==4.15.2; python_version >= '3.8'
sniffio==1.3.0; python_version >= '3.7'
sortedcontainers==2.4.0
soupsieve==2.5; python_version >= '3.8'
trio==0.23.1; python_version >= '3.8'
trio-websocket==0.11.1; python_version >= '3.7'
typer==0.9.0; python_version >= '3.6'
typing-extensions==4.8.0; python_version >= '3.8'
urllib3[socks]==2.1.0; python_version >= '3.8'
wsproto==1.2.0; python_full_version >= '3.7.0'
45 changes: 45 additions & 0 deletions scrapers/scrapers.py
@@ -0,0 +1,45 @@
import re

import typer
from src import craigslist as cl
from src import database as db
from src import facebook as fb
from src import utils

app = typer.Typer()

craigslistScraperVersion = 1
facebookScraperVersion = 1


@app.command()
def craigslist(event, context):
utils.scrape("craigslist", craigslistScraperVersion)


@app.command()
def facebook(event, context):
utils.scrape("facebook", facebookScraperVersion)


@app.command()
def link(link: str):
clPattern = re.compile(
r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$"
)
fbPattern = re.compile(
r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$"
)

if clPattern.match(link):
newInfo = cl.scrapeListing(link)
db.update(link, newInfo)
elif fbPattern.match(link):
newInfo = fb.scrapeListing(link)
print(newInfo)
else:
print("Not a Craigslist nor a Facebook Marketplace link")


if __name__ == "__main__":
app()
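
Because this file builds a Typer app, the commands can also be invoked straight from a shell; a sketch of the ``link`` command (the listing URL here is illustrative):
```bash
# From scrapers/, inside the pipenv shell
python3 scrapers.py link "https://dallas.craigslist.org/dal/cto/d/example-car/0000000000.html"
```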
Empty file added scrapers/src/__init__.py
148 changes: 148 additions & 0 deletions scrapers/src/craigslist.py
@@ -0,0 +1,148 @@
import time

from bs4 import BeautifulSoup

from . import utils


def loadPageResources(driver):
scroll = 100

print("Waiting to load...")
time.sleep(2)

utils.scrollTo(scroll, driver)

loadImgButtons = driver.find_elements("class name", "slider-back-arrow")

time.sleep(2)

# Emulate a user scrolling
for i in range(len(loadImgButtons)):
scroll += 100
utils.scrollTo(scroll, driver)

utils.clickOn(loadImgButtons[i], driver)

time.sleep(0.5)


def setupURLs(oldestAllowedCars):
# List of TX cities to scrape; can be expanded
cities = [
"abilene",
"amarillo",
"austin",
"beaumont",
"brownsville",
"collegestation",
"corpuschristi",
"dallas",
"nacogdoches",
"delrio",
"elpaso",
"galveston",
"houston",
"killeen",
"laredo",
"lubbock",
"mcallen",
"odessa",
"sanangelo",
"sanantonio",
"sanmarcos",
"bigbend",
"texoma",
"easttexas",
"victoriatx",
"waco",
"wichitafalls",
]

    # Craigslist cars & trucks (cta) search URL, filtered by minimum model year
base_url = (
"https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0"
)
return [base_url.format(city, oldestAllowedCars) for city in cities]
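
For example, ``setupURLs(2011)`` yields one search URL per city; the first is:
```python
>>> setupURLs(2011)[0]
'https://abilene.craigslist.org/search/cta?min_auto_year=2011#search=1~gallery~0~0'
```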


def getAllPosts(browser):
# Create a BeautifulSoup object from the HTML of the page
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")

# Find all of the car listings on the page
return soup.find_all("div", class_="gallery-card")


def getCarInfo(post):
title = post.find("span", class_="label").text

print(f'Scraping "{title}"')

price = post.find("span", class_="priceinfo").text
metadata = post.find("div", class_="meta").text.split("·")

    odometer = metadata[1]
    location = None  # not every listing includes a location
    if len(metadata) >= 3:
        location = metadata[2]

link = post.find("a", class_="posting-title", href=True)["href"]

imageElements = post.findAll("img")
images = [img["src"] for img in imageElements]

return title, price, location, odometer, link, images


def processAttributes(attributes):
processedAttributes = []

for attr in attributes:
        label, value = attr.split(": ", 1)  # split once; values may contain ": "
processedAttributes.append(
{"label": label.replace(" ", "-").lower(), "value": value}
)

return processedAttributes
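
A worked example of this normalization (the inputs mimic Craigslist ``attrgroup`` strings):
```python
>>> processAttributes(["fuel: gas", "title status: clean"])
[{'label': 'fuel', 'value': 'gas'}, {'label': 'title-status', 'value': 'clean'}]
```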


def scrapeListing(url):
browser = utils.setupBrowser()

# Navigate to the URL
print(f"Going to {url}")
browser.get(url)

print(f"Loading page for {url}")
time.sleep(1)

# Create a BeautifulSoup object from the HTML of the page
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")

    try:
        description = soup.find("section", id="postingbody").text
        attributes = processAttributes(
            [
                attr.text
                for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span")
            ]
        )
        mapElement = soup.find("div", id="map")  # avoid shadowing the map() builtin

        car = {
            "postBody": description,
            "longitude": mapElement["data-longitude"],
            "latitude": mapElement["data-latitude"],
        }

        for attr in attributes:
            car[attr["label"]] = attr["value"]

        return car
    except Exception as e:
        print(f"Failed scraping {url}: \n{e}")
    finally:
        # Close the Selenium WebDriver instance whether or not scraping succeeded
        browser.quit()
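
``scrapers.py`` and ``craigslist.py`` import ``setupBrowser``, ``scrollTo``, and ``clickOn`` from ``scrapers/src/utils.py``, which this diff does not show. A minimal, hypothetical sketch of those Selenium helpers, assuming the Chrome and chromedriver paths the Dockerfile installs (the ``scrape`` and database helpers are omitted since their behavior isn't visible here):
```python
# Hypothetical sketch of the helpers in scrapers/src/utils.py --
# the real file is not part of this diff.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def setupBrowser():
    # Headless Chrome pointed at the binaries the Dockerfile unpacks into /opt
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.binary_location = "/opt/chrome/chrome"
    return webdriver.Chrome(service=Service("/opt/chromedriver"), options=options)


def scrollTo(y, driver):
    # Jump the viewport to a fixed vertical offset, emulating a user scroll
    driver.execute_script(f"window.scrollTo(0, {y});")


def clickOn(element, driver):
    # Click through JavaScript so overlays don't intercept the event
    driver.execute_script("arguments[0].click();", element)
```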

1 comment on commit 9c28455

@vercel bot commented on 9c28455 Dec 1, 2023

Successfully deployed to the following URLs:

seniordesign – ./

seniordesign-lryanle.vercel.app
smare.lryanle.com
seniordesign-git-main-lryanle.vercel.app
smare.vercel.app
