Merge pull request #56 from lryanle/6/craigslist-scraper
6/craigslist scraper
Showing 17 changed files with 1,025 additions and 255 deletions.
.gitignore
@@ -130,4 +130,7 @@ dist
.pnp.*

# misc
*.DS_STORE

# python
*.pyc
File renamed without changes.
.flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120
Dockerfile
@@ -0,0 +1,25 @@
FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
RUN yum install -y unzip-* && \
    curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
    curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
    unzip /tmp/chromedriver-linux64.zip -d /opt/ && \
    unzip /tmp/chrome-linux64.zip -d /opt/ && \
    yum clean all

FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
    libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
    libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
    xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
    yum clean all
COPY --from=build /opt/chrome-linux64 /opt/chrome
COPY --from=build /opt/chromedriver-linux64 /opt/

WORKDIR /var/task
COPY scrapers.py ./
COPY src ./src
COPY requirements.txt ./

RUN pip install --no-cache-dir -r requirements.txt

CMD [ "scrapers.craigslist" ]
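
The build stage pins Chrome for Testing 119.0.6045.105 under /opt/chrome and its matching chromedriver under /opt. A minimal sketch of how Selenium could be pointed at those paths inside the image (the paths are inferred from the COPY lines above, the helper name is hypothetical, and utils.setupBrowser presumably does something similar):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def makeLambdaBrowser():
    # Paths inferred from the COPY steps above, not confirmed by this diff
    options = Options()
    options.binary_location = "/opt/chrome/chrome"
    options.add_argument("--headless=new")           # Lambda has no display
    options.add_argument("--no-sandbox")             # required in most containers
    options.add_argument("--disable-dev-shm-usage")  # /dev/shm is small on Lambda
    return webdriver.Chrome(service=Service("/opt/chromedriver"), options=options)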
Pipfile
@@ -0,0 +1,26 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[scripts]
build = "docker build --platform linux/amd64 -t smare ."
cont = "docker run --name smarecontainer -d smare:latest"
exec = "docker exec -it smarecontainer"
craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'"
facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'"

[packages]
selenium = "*"
bs4 = "*"
pymongo = "*"
typer = "*"
python-dotenv = "*"

[dev-packages]
isort = "*"
black = "*"
flake8 = "*"

[requires]
python_version = "3.11"
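
For local testing, the [scripts] entries above chain together through pipenv, for example:

pipenv run build       # docker build --platform linux/amd64 -t smare .
pipenv run cont        # start a detached smarecontainer from that image
pipenv run craigslist  # exec the Craigslist scraper inside the container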
Large diffs are not rendered by default.
requirements.txt
@@ -0,0 +1,23 @@
-i https://pypi.org/simple
attrs==23.1.0; python_version >= '3.7'
beautifulsoup4==4.12.2; python_full_version >= '3.6.0'
bs4==0.0.1
certifi==2023.11.17; python_version >= '3.6'
click==8.1.7; python_version >= '3.7'
dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0'
h11==0.14.0; python_version >= '3.7'
idna==3.6; python_version >= '3.5'
outcome==1.3.0.post0; python_version >= '3.7'
pymongo==4.6.1; python_version >= '3.7'
pysocks==1.7.1
python-dotenv==1.0.0; python_version >= '3.8'
selenium==4.15.2; python_version >= '3.8'
sniffio==1.3.0; python_version >= '3.7'
sortedcontainers==2.4.0
soupsieve==2.5; python_version >= '3.8'
trio==0.23.1; python_version >= '3.8'
trio-websocket==0.11.1; python_version >= '3.7'
typer==0.9.0; python_version >= '3.6'
typing-extensions==4.8.0; python_version >= '3.8'
urllib3[socks]==2.1.0; python_version >= '3.8'
wsproto==1.2.0; python_full_version >= '3.7.0'
scrapers.py
@@ -0,0 +1,45 @@
import re

import typer
from src import craigslist as cl
from src import database as db
from src import facebook as fb
from src import utils

app = typer.Typer()

craigslistScraperVersion = 1
facebookScraperVersion = 1


@app.command()
def craigslist(event, context):
    # Lambda-style handler signature; event and context are unused here
    utils.scrape("craigslist", craigslistScraperVersion)


@app.command()
def facebook(event, context):
    utils.scrape("facebook", facebookScraperVersion)


@app.command()
def link(link: str):
    clPattern = re.compile(
        r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$"
    )
    fbPattern = re.compile(
        r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$"
    )

    # Re-scrape a single listing and update its stored record
    if clPattern.match(link):
        newInfo = cl.scrapeListing(link)
        db.update(link, newInfo)
    elif fbPattern.match(link):
        newInfo = fb.scrapeListing(link)
        print(newInfo)
    else:
        print("Neither a Craigslist nor a Facebook Marketplace link")


if __name__ == "__main__":
    app()
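
A quick illustration of how the link command routes URLs between the two scrapers (the sample listing URLs below are made up for demonstration):

import re

clPattern = re.compile(r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$")
fbPattern = re.compile(r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$")

# Hypothetical inputs
print(bool(clPattern.match("https://dallas.craigslist.org/cto/d/2014-honda-civic/7000000000.html")))  # True
print(bool(fbPattern.match("https://www.facebook.com/marketplace/item/123456789")))                   # True
print(bool(clPattern.match("https://example.com/listing")))                                           # False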
Empty file.
src/craigslist.py
@@ -0,0 +1,148 @@
import time

from bs4 import BeautifulSoup

from . import utils


def loadPageResources(driver):
    scroll = 100

    print("Waiting to load...")
    time.sleep(2)

    utils.scrollTo(scroll, driver)

    loadImgButtons = driver.find_elements("class name", "slider-back-arrow")

    time.sleep(2)

    # Emulate a user scrolling, clicking each image slider so its images load
    for i in range(len(loadImgButtons)):
        scroll += 100
        utils.scrollTo(scroll, driver)

        utils.clickOn(loadImgButtons[i], driver)

        time.sleep(0.5)

def setupURLs(oldestAllowedCars):
    # List of TX cities to scrape; can be expanded
    cities = [
        "abilene",
        "amarillo",
        "austin",
        "beaumont",
        "brownsville",
        "collegestation",
        "corpuschristi",
        "dallas",
        "nacogdoches",
        "delrio",
        "elpaso",
        "galveston",
        "houston",
        "killeen",
        "laredo",
        "lubbock",
        "mcallen",
        "odessa",
        "sanangelo",
        "sanantonio",
        "sanmarcos",
        "bigbend",
        "texoma",
        "easttexas",
        "victoriatx",
        "waco",
        "wichitafalls",
    ]

    # Build each city's Craigslist cars-and-trucks search URL,
    # filtered by minimum model year
    base_url = (
        "https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0"
    )
    return [base_url.format(city, oldestAllowedCars) for city in cities]


def getAllPosts(browser):
    # Create a BeautifulSoup object from the HTML of the page
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Find all of the car listings on the page
    return soup.find_all("div", class_="gallery-card")

def getCarInfo(post):
    title = post.find("span", class_="label").text

    print(f'Scraping "{title}"')

    price = post.find("span", class_="priceinfo").text
    metadata = post.find("div", class_="meta").text.split("·")

    odometer = metadata[1]
    # Not every listing includes a location; default to None so the
    # variable is always bound
    location = metadata[2] if len(metadata) >= 3 else None

    link = post.find("a", class_="posting-title", href=True)["href"]

    imageElements = post.findAll("img")
    images = [img["src"] for img in imageElements]

    return title, price, location, odometer, link, images

def processAttributes(attributes):
    # e.g. "fuel: gas" -> {"label": "fuel", "value": "gas"}
    processedAttributes = []

    for attr in attributes:
        # Split on the first ": " only, in case the value itself contains one
        label, value = attr.split(": ", 1)
        processedAttributes.append(
            {"label": label.replace(" ", "-").lower(), "value": value}
        )

    return processedAttributes

def scrapeListing(url):
    browser = utils.setupBrowser()

    # Navigate to the URL
    print(f"Going to {url}")
    browser.get(url)

    print(f"Loading page for {url}")
    time.sleep(1)

    # Create a BeautifulSoup object from the HTML of the page
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")

    try:
        description = soup.find("section", id="postingbody").text
        attributes = processAttributes(
            [
                attr.text
                for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span")
            ]
        )
        mapElement = soup.find("div", id="map")

        car = {
            "postBody": description,
            "longitude": mapElement["data-longitude"],
            "latitude": mapElement["data-latitude"],
        }

        for attr in attributes:
            car[attr["label"]] = attr["value"]

        return car
    except Exception as e:
        print(f"Failed scraping {url}: \n{e}")
    finally:
        # Close the Selenium WebDriver instance whether or not scraping succeeded
        browser.quit()
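
src/craigslist.py leans on helpers from src/utils (setupBrowser, scrollTo, clickOn) whose contents are not rendered in this diff. setupBrowser would wire up headless Chrome roughly as sketched after the Dockerfile above; the other two are plausibly thin Selenium wrappers along these lines (a sketch under that assumption, not the actual implementation):

def scrollTo(y, driver):
    # Scroll the window to an absolute vertical offset, in pixels
    driver.execute_script(f"window.scrollTo(0, {y});")


def clickOn(element, driver):
    # Click via JavaScript so slider arrows register even when partially covered
    driver.execute_script("arguments[0].click();", element)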
9c28455
Successfully deployed to the following URLs:
seniordesign – ./
seniordesign-lryanle.vercel.app
smare.lryanle.com
seniordesign-git-main-lryanle.vercel.app
smare.vercel.app