diff --git a/.gitignore b/.gitignore
index 4763acb..08eebad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,4 +130,7 @@ dist
.pnp.*
# misc
-*.DS_STORE
\ No newline at end of file
+*.DS_STORE
+
+# python
+*.pyc
\ No newline at end of file
diff --git a/README.md b/README.md
index 2454dd3..2001e46 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,36 @@ Make a copy of the ``.env.example`` file and make the following changes.
2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
3. Paste the connection URL provided in MongoDB Atlas. Include the password and username fields using ``${VARIABLE}`` syntax to embed the value of the variable
+
+## Run Scrapers locally
+**Prerequisites**
+- python3
+- pipenv
+
+**Installing dependencies**
+Navigate to ``scrapers/`` and activate the virtual environment using
+```bash
+pipenv shell
+```
+Then install dependencies using
+```bash
+pipenv install
+```
+
+**Scraper Usage**
+To build a Docker image, use
+```bash
+pipenv run build
+```
+To run a Docker container named "smarecontainer", use
+```bash
+pipenv run cont
+```
+Then run a scraper:
+```bash
+# Scrape Craigslist homepage
+pipenv run craigslist
+
+# Scrape Facebook Marketplace homepage
+pipenv run facebook
+```
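+
+**Scraping a single listing (sketch)**
+``scrapers.py`` also exposes a ``link`` command that scrapes a single Craigslist or Facebook Marketplace listing by URL. There is no dedicated Pipenv script for it, so the sketch below calls it through the ``exec`` script the same way the commands above do; ``<listing-url>`` is a placeholder.
+```bash
+# Scrape one listing inside the running "smarecontainer"
+pipenv run exec python3 -c 'import scrapers; scrapers.link("<listing-url>")'
+```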
diff --git a/github-metrics.svg b/github-metrics.svg
index a818099..0dfe375 100644
--- a/github-metrics.svg
+++ b/github-metrics.svg
@@ -25,31 +25,31 @@
- Deployed 148 times
+ Deployed 164 times
- 3.07 MB used
+ 3.47 MB used
@@ -152,7 +152,7 @@
4 merged
5 merged
@@ -196,7 +196,7 @@
- Updated less than 1 day ago
+ Updated 1 day ago
@@ -234,7 +234,7 @@
- Last updated 29 Nov 2023, 20:52:19 with lowlighter/metrics@3.34.0
+ Last updated 1 Dec 2023, 08:37:38 with lowlighter/metrics@3.34.0
diff --git a/metrics.plugin.screenshot.svg b/metrics.plugin.screenshot.svg
index 11d37c6..5d1461f 100644
--- a/metrics.plugin.screenshot.svg
+++ b/metrics.plugin.screenshot.svg
diff --git a/.env.example b/scrapers/.env.example
similarity index 100%
rename from .env.example
rename to scrapers/.env.example
diff --git a/scrapers/.flake8 b/scrapers/.flake8
new file mode 100644
index 0000000..79a16af
--- /dev/null
+++ b/scrapers/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 120
\ No newline at end of file
diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile
new file mode 100644
index 0000000..844d302
--- /dev/null
+++ b/scrapers/Dockerfile
@@ -0,0 +1,25 @@
+FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
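+# Build stage: fetch Chrome for Testing 119 and the matching Chromedriver,
+# then unzip both into /opt for the runtime stage to copy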
+RUN yum install -y unzip-* && \
+ curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
+ curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
+ unzip /tmp/chromedriver-linux64.zip -d /opt/ && \
+ unzip /tmp/chrome-linux64.zip -d /opt/ && \
+ yum clean all
+
+FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
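+# Runtime stage: install the shared libraries headless Chrome needs on top of
+# the Lambda Python base image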
+RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
+ libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
+ libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
+ xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
+ yum clean all
+COPY --from=build /opt/chrome-linux64 /opt/chrome
+COPY --from=build /opt/chromedriver-linux64 /opt/
+
+WORKDIR /var/task
+COPY scrapers.py ./
+COPY src ./src
+COPY requirements.txt ./
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+CMD [ "scrapers.craigslist" ]
\ No newline at end of file
diff --git a/scrapers/Pipfile b/scrapers/Pipfile
new file mode 100644
index 0000000..0ecb354
--- /dev/null
+++ b/scrapers/Pipfile
@@ -0,0 +1,26 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[scripts]
+build = "docker build --platform linux/amd64 -t smare ."
+cont = "docker run --name smarecontainer -d smare:latest"
+exec = "docker exec -it smarecontainer"
+craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'"
+facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'"
+
+[packages]
+selenium = "*"
+bs4 = "*"
+pymongo = "*"
+typer = "*"
+python-dotenv = "*"
+
+[dev-packages]
+isort = "*"
+black = "*"
+flake8 = "*"
+
+[requires]
+python_version = "3.11"
diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock
new file mode 100644
index 0000000..bb2797e
--- /dev/null
+++ b/scrapers/Pipfile.lock
@@ -0,0 +1,389 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "716098b2b29f4b98c932bd4554e3953a184fea6603a7d3f17e7bd47179932031"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.11"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "attrs": {
+ "hashes": [
+ "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04",
+ "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==23.1.0"
+ },
+ "beautifulsoup4": {
+ "hashes": [
+ "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da",
+ "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"
+ ],
+ "markers": "python_full_version >= '3.6.0'",
+ "version": "==4.12.2"
+ },
+ "bs4": {
+ "hashes": [
+ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
+ ],
+ "index": "pypi",
+ "version": "==0.0.1"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1",
+ "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2023.11.17"
+ },
+ "click": {
+ "hashes": [
+ "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28",
+ "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.7"
+ },
+ "dnspython": {
+ "hashes": [
+ "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8",
+ "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984"
+ ],
+ "markers": "python_version >= '3.8' and python_version < '4.0'",
+ "version": "==2.4.2"
+ },
+ "h11": {
+ "hashes": [
+ "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d",
+ "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.14.0"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca",
+ "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==3.6"
+ },
+ "outcome": {
+ "hashes": [
+ "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8",
+ "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.3.0.post0"
+ },
+ "pymongo": {
+ "hashes": [
+ "sha256:00c199e1c593e2c8b033136d7a08f0c376452bac8a896c923fcd6f419e07bdd2",
+ "sha256:010bc9aa90fd06e5cc52c8fac2c2fd4ef1b5f990d9638548dde178005770a5e8",
+ "sha256:026a24a36394dc8930cbcb1d19d5eb35205ef3c838a7e619e04bd170713972e7",
+ "sha256:061598cbc6abe2f382ab64c9caa83faa2f4c51256f732cdd890bcc6e63bfb67e",
+ "sha256:13552ca505366df74e3e2f0a4f27c363928f3dff0eef9f281eb81af7f29bc3c5",
+ "sha256:13d613c866f9f07d51180f9a7da54ef491d130f169e999c27e7633abe8619ec9",
+ "sha256:144a31391a39a390efce0c5ebcaf4bf112114af4384c90163f402cec5ede476b",
+ "sha256:1461199b07903fc1424709efafe379205bf5f738144b1a50a08b0396357b5abf",
+ "sha256:154b361dcb358ad377d5d40df41ee35f1cc14c8691b50511547c12404f89b5cb",
+ "sha256:1c5654bb8bb2bdb10e7a0bc3c193dd8b49a960b9eebc4381ff5a2043f4c3c441",
+ "sha256:1de3c6faf948f3edd4e738abdb4b76572b4f4fdfc1fed4dad02427e70c5a6219",
+ "sha256:1ed23b0e2dac6f84f44c8494fbceefe6eb5c35db5c1099f56ab78fc0d94ab3af",
+ "sha256:1f2b856518bfcfa316c8dae3d7b412aecacf2e8ba30b149f5eb3b63128d703b9",
+ "sha256:2346450a075625c4d6166b40a013b605a38b6b6168ce2232b192a37fb200d588",
+ "sha256:262356ea5fcb13d35fb2ab6009d3927bafb9504ef02339338634fffd8a9f1ae4",
+ "sha256:27b81ecf18031998ad7db53b960d1347f8f29e8b7cb5ea7b4394726468e4295e",
+ "sha256:2940aa20e9cc328e8ddeacea8b9a6f5ddafe0b087fedad928912e787c65b4909",
+ "sha256:2d4ccac3053b84a09251da8f5350bb684cbbf8c8c01eda6b5418417d0a8ab198",
+ "sha256:2dd2f6960ee3c9360bed7fb3c678be0ca2d00f877068556785ec2eb6b73d2414",
+ "sha256:3071ec998cc3d7b4944377e5f1217c2c44b811fae16f9a495c7a1ce9b42fb038",
+ "sha256:3094c7d2f820eecabadae76bfec02669567bbdd1730eabce10a5764778564f7b",
+ "sha256:30b2c9caf3e55c2e323565d1f3b7e7881ab87db16997dc0cbca7c52885ed2347",
+ "sha256:3177f783ae7e08aaf7b2802e0df4e4b13903520e8380915e6337cdc7a6ff01d8",
+ "sha256:31dab1f3e1d0cdd57e8df01b645f52d43cc1b653ed3afd535d2891f4fc4f9712",
+ "sha256:33bb16a07d3cc4e0aea37b242097cd5f7a156312012455c2fa8ca396953b11c4",
+ "sha256:349093675a2d3759e4fb42b596afffa2b2518c890492563d7905fac503b20daa",
+ "sha256:39d77d8bbb392fa443831e6d4ae534237b1f4eee6aa186f0cdb4e334ba89536e",
+ "sha256:3a7f02a58a0c2912734105e05dedbee4f7507e6f1bd132ebad520be0b11d46fd",
+ "sha256:3b287e814a01deddb59b88549c1e0c87cefacd798d4afc0c8bd6042d1c3d48aa",
+ "sha256:3c74f4725485f0a7a3862cfd374cc1b740cebe4c133e0c1425984bcdcce0f4bb",
+ "sha256:3cadf7f4c8e94d8a77874b54a63c80af01f4d48c4b669c8b6867f86a07ba994f",
+ "sha256:3d18a9b9b858ee140c15c5bfcb3e66e47e2a70a03272c2e72adda2482f76a6ad",
+ "sha256:3f0e6a6c807fa887a0c51cc24fe7ea51bb9e496fe88f00d7930063372c3664c3",
+ "sha256:4344c30025210b9fa80ec257b0e0aab5aa1d5cca91daa70d82ab97b482cc038e",
+ "sha256:4497d49d785482cc1a44a0ddf8830b036a468c088e72a05217f5b60a9e025012",
+ "sha256:547dc5d7f834b1deefda51aedb11a7af9c51c45e689e44e14aa85d44147c7657",
+ "sha256:5556e306713e2522e460287615d26c0af0fe5ed9d4f431dad35c6624c5d277e9",
+ "sha256:55dac73316e7e8c2616ba2e6f62b750918e9e0ae0b2053699d66ca27a7790105",
+ "sha256:56816e43c92c2fa8c11dc2a686f0ca248bea7902f4a067fa6cbc77853b0f041e",
+ "sha256:5bd94c503271e79917b27c6e77f7c5474da6930b3fb9e70a12e68c2dff386b9a",
+ "sha256:5ec31adc2e988fd7db3ab509954791bbc5a452a03c85e45b804b4bfc31fa221d",
+ "sha256:69247f7a2835fc0984bbf0892e6022e9a36aec70e187fcfe6cae6a373eb8c4de",
+ "sha256:6a0ae7a48a6ef82ceb98a366948874834b86c84e288dbd55600c1abfc3ac1d88",
+ "sha256:6a1810c2cbde714decf40f811d1edc0dae45506eb37298fd9d4247b8801509fe",
+ "sha256:76013fef1c9cd1cd00d55efde516c154aa169f2bf059b197c263a255ba8a9ddf",
+ "sha256:77e0df59b1a4994ad30c6d746992ae887f9756a43fc25dec2db515d94cf0222d",
+ "sha256:7bb0e9049e81def6829d09558ad12d16d0454c26cabe6efc3658e544460688d9",
+ "sha256:88beb444fb438385e53dc9110852910ec2a22f0eab7dd489e827038fdc19ed8d",
+ "sha256:8b47ebd89e69fbf33d1c2df79759d7162fc80c7652dacfec136dae1c9b3afac7",
+ "sha256:8d219b4508f71d762368caec1fc180960569766049bbc4d38174f05e8ef2fe5b",
+ "sha256:8ec75f35f62571a43e31e7bd11749d974c1b5cd5ea4a8388725d579263c0fdf6",
+ "sha256:9167e735379ec43d8eafa3fd675bfbb12e2c0464f98960586e9447d2cf2c7a83",
+ "sha256:9a710c184ba845afb05a6f876edac8f27783ba70e52d5eaf939f121fc13b2f59",
+ "sha256:9aafd036f6f2e5ad109aec92f8dbfcbe76cff16bad683eb6dd18013739c0b3ae",
+ "sha256:9c79d597fb3a7c93d7c26924db7497eba06d58f88f58e586aa69b2ad89fee0f8",
+ "sha256:a2831e05ce0a4df10c4ac5399ef50b9a621f90894c2a4d2945dc5658765514ed",
+ "sha256:a5e641f931c5cd95b376fd3c59db52770e17bec2bf86ef16cc83b3906c054845",
+ "sha256:b10d8cda9fc2fcdcfa4a000aa10413a2bf8b575852cd07cb8a595ed09689ca98",
+ "sha256:b435b13bb8e36be11b75f7384a34eefe487fe87a6267172964628e2b14ecf0a7",
+ "sha256:b7b1a83ce514700276a46af3d9e481ec381f05b64939effc9065afe18456a6b9",
+ "sha256:b8729dbf25eb32ad0dc0b9bd5e6a0d0b7e5c2dc8ec06ad171088e1896b522a74",
+ "sha256:bbed8cccebe1169d45cedf00461b2842652d476d2897fd1c42cf41b635d88746",
+ "sha256:c258dbacfff1224f13576147df16ce3c02024a0d792fd0323ac01bed5d3c545d",
+ "sha256:c30a9e06041fbd7a7590693ec5e407aa8737ad91912a1e70176aff92e5c99d20",
+ "sha256:c91ea3915425bd4111cb1b74511cdc56d1d16a683a48bf2a5a96b6a6c0f297f7",
+ "sha256:d0355cff58a4ed6d5e5f6b9c3693f52de0784aa0c17119394e2a8e376ce489d4",
+ "sha256:d483793a384c550c2d12cb794ede294d303b42beff75f3b3081f57196660edaf",
+ "sha256:d4c2be9760b112b1caf649b4977b81b69893d75aa86caf4f0f398447be871f3c",
+ "sha256:d8e62d06e90f60ea2a3d463ae51401475568b995bafaffd81767d208d84d7bb1",
+ "sha256:da08ea09eefa6b960c2dd9a68ec47949235485c623621eb1d6c02b46765322ac",
+ "sha256:dd1fa413f8b9ba30140de198e4f408ffbba6396864c7554e0867aa7363eb58b2",
+ "sha256:e2aced6fb2f5261b47d267cb40060b73b6527e64afe54f6497844c9affed5fd0",
+ "sha256:e438417ce1dc5b758742e12661d800482200b042d03512a8f31f6aaa9137ad40",
+ "sha256:e470fa4bace5f50076c32f4b3cc182b31303b4fefb9b87f990144515d572820b",
+ "sha256:eaf2f65190c506def2581219572b9c70b8250615dc918b3b7c218361a51ec42e",
+ "sha256:ef102a67ede70e1721fe27f75073b5314911dbb9bc27cde0a1c402a11531e7bd",
+ "sha256:ef801027629c5b511cf2ba13b9be29bfee36ae834b2d95d9877818479cdc99ea",
+ "sha256:f7acc03a4f1154ba2643edeb13658d08598fe6e490c3dd96a241b94f09801626",
+ "sha256:f9756f1d25454ba6a3c2f1ef8b7ddec23e5cdeae3dc3c3377243ae37a383db00",
+ "sha256:ff62ba8ff70f01ab4fe0ae36b2cb0b5d1f42e73dfc81ddf0758cd9f77331ad25",
+ "sha256:ff925f1cca42e933376d09ddc254598f8c5fcd36efc5cac0118bb36c36217c41"
+ ],
+ "index": "pypi",
+ "markers": "python_version >= '3.7'",
+ "version": "==4.6.1"
+ },
+ "pysocks": {
+ "hashes": [
+ "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299",
+ "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5",
+ "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"
+ ],
+ "version": "==1.7.1"
+ },
+ "python-dotenv": {
+ "hashes": [
+ "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba",
+ "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"
+ ],
+ "index": "pypi",
+ "markers": "python_version >= '3.8'",
+ "version": "==1.0.0"
+ },
+ "selenium": {
+ "hashes": [
+ "sha256:22eab5a1724c73d51b240a69ca702997b717eee4ba1f6065bf5d6b44dba01d48",
+ "sha256:9e82cd1ac647fb73cf0d4a6e280284102aaa3c9d94f0fa6e6cc4b5db6a30afbf"
+ ],
+ "index": "pypi",
+ "markers": "python_version >= '3.8'",
+ "version": "==4.15.2"
+ },
+ "sniffio": {
+ "hashes": [
+ "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101",
+ "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.3.0"
+ },
+ "sortedcontainers": {
+ "hashes": [
+ "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88",
+ "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"
+ ],
+ "version": "==2.4.0"
+ },
+ "soupsieve": {
+ "hashes": [
+ "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690",
+ "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"
+ ],
+ "markers": "python_version >= '3.8'",
+ "version": "==2.5"
+ },
+ "trio": {
+ "hashes": [
+ "sha256:16f89f7dcc8f7b9dcdec1fcd863e0c039af6d0f9a22f8dfd56f75d75ec73fd48",
+ "sha256:bb4abb3f4af23f96679e7c8cdabb8b234520f2498550d2cf63ebfd95f2ce27fe"
+ ],
+ "markers": "python_version >= '3.8'",
+ "version": "==0.23.1"
+ },
+ "trio-websocket": {
+ "hashes": [
+ "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f",
+ "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.11.1"
+ },
+ "typer": {
+ "hashes": [
+ "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2",
+ "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"
+ ],
+ "index": "pypi",
+ "markers": "python_version >= '3.6'",
+ "version": "==0.9.0"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0",
+ "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"
+ ],
+ "markers": "python_version >= '3.8'",
+ "version": "==4.8.0"
+ },
+ "urllib3": {
+ "extras": [
+ "socks"
+ ],
+ "hashes": [
+ "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3",
+ "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54"
+ ],
+ "markers": "python_version >= '3.8'",
+ "version": "==2.1.0"
+ },
+ "wsproto": {
+ "hashes": [
+ "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065",
+ "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"
+ ],
+ "markers": "python_full_version >= '3.7.0'",
+ "version": "==1.2.0"
+ }
+ },
+ "develop": {
+ "black": {
+ "hashes": [
+ "sha256:250d7e60f323fcfc8ea6c800d5eba12f7967400eb6c2d21ae85ad31c204fb1f4",
+ "sha256:2a9acad1451632021ee0d146c8765782a0c3846e0e0ea46659d7c4f89d9b212b",
+ "sha256:412f56bab20ac85927f3a959230331de5614aecda1ede14b373083f62ec24e6f",
+ "sha256:421f3e44aa67138ab1b9bfbc22ee3780b22fa5b291e4db8ab7eee95200726b07",
+ "sha256:45aa1d4675964946e53ab81aeec7a37613c1cb71647b5394779e6efb79d6d187",
+ "sha256:4c44b7211a3a0570cc097e81135faa5f261264f4dfaa22bd5ee2875a4e773bd6",
+ "sha256:4c68855825ff432d197229846f971bc4d6666ce90492e5b02013bcaca4d9ab05",
+ "sha256:5133f5507007ba08d8b7b263c7aa0f931af5ba88a29beacc4b2dc23fcefe9c06",
+ "sha256:54caaa703227c6e0c87b76326d0862184729a69b73d3b7305b6288e1d830067e",
+ "sha256:58e5f4d08a205b11800332920e285bd25e1a75c54953e05502052738fe16b3b5",
+ "sha256:698c1e0d5c43354ec5d6f4d914d0d553a9ada56c85415700b81dc90125aac244",
+ "sha256:6c1cac07e64433f646a9a838cdc00c9768b3c362805afc3fce341af0e6a9ae9f",
+ "sha256:760415ccc20f9e8747084169110ef75d545f3b0932ee21368f63ac0fee86b221",
+ "sha256:7f622b6822f02bfaf2a5cd31fdb7cd86fcf33dab6ced5185c35f5db98260b055",
+ "sha256:cf57719e581cfd48c4efe28543fea3d139c6b6f1238b3f0102a9c73992cbb479",
+ "sha256:d136ef5b418c81660ad847efe0e55c58c8208b77a57a28a503a5f345ccf01394",
+ "sha256:dbea0bb8575c6b6303cc65017b46351dc5953eea5c0a59d7b7e3a2d2f433a911",
+ "sha256:fc7f6a44d52747e65a02558e1d807c82df1d66ffa80a601862040a43ec2e3142"
+ ],
+ "index": "pypi",
+ "markers": "python_version >= '3.8'",
+ "version": "==23.11.0"
+ },
+ "click": {
+ "hashes": [
+ "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28",
+ "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.7"
+ },
+ "flake8": {
+ "hashes": [
+ "sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23",
+ "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5"
+ ],
+ "index": "pypi",
+ "markers": "python_full_version >= '3.8.1'",
+ "version": "==6.1.0"
+ },
+ "isort": {
+ "hashes": [
+ "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504",
+ "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"
+ ],
+ "index": "pypi",
+ "markers": "python_full_version >= '3.8.0'",
+ "version": "==5.12.0"
+ },
+ "mccabe": {
+ "hashes": [
+ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
+ "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.7.0"
+ },
+ "mypy-extensions": {
+ "hashes": [
+ "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d",
+ "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==1.0.0"
+ },
+ "packaging": {
+ "hashes": [
+ "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5",
+ "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==23.2"
+ },
+ "pathspec": {
+ "hashes": [
+ "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20",
+ "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.11.2"
+ },
+ "platformdirs": {
+ "hashes": [
+ "sha256:118c954d7e949b35437270383a3f2531e99dd93cf7ce4dc8340d3356d30f173b",
+ "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==4.0.0"
+ },
+ "pycodestyle": {
+ "hashes": [
+ "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f",
+ "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67"
+ ],
+ "markers": "python_version >= '3.8'",
+ "version": "==2.11.1"
+ },
+ "pyflakes": {
+ "hashes": [
+ "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774",
+ "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"
+ ],
+ "markers": "python_version >= '3.8'",
+ "version": "==3.1.0"
+ }
+ }
+}
diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt
new file mode 100644
index 0000000..e0430c4
--- /dev/null
+++ b/scrapers/requirements.txt
@@ -0,0 +1,23 @@
+-i https://pypi.org/simple
+attrs==23.1.0; python_version >= '3.7'
+beautifulsoup4==4.12.2; python_full_version >= '3.6.0'
+bs4==0.0.1
+certifi==2023.11.17; python_version >= '3.6'
+click==8.1.7; python_version >= '3.7'
+dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0'
+h11==0.14.0; python_version >= '3.7'
+idna==3.6; python_version >= '3.5'
+outcome==1.3.0.post0; python_version >= '3.7'
+pymongo==4.6.1; python_version >= '3.7'
+pysocks==1.7.1
+python-dotenv==1.0.0; python_version >= '3.8'
+selenium==4.15.2; python_version >= '3.8'
+sniffio==1.3.0; python_version >= '3.7'
+sortedcontainers==2.4.0
+soupsieve==2.5; python_version >= '3.8'
+trio==0.23.1; python_version >= '3.8'
+trio-websocket==0.11.1; python_version >= '3.7'
+typer==0.9.0; python_version >= '3.6'
+typing-extensions==4.8.0; python_version >= '3.8'
+urllib3[socks]==2.1.0; python_version >= '3.8'
+wsproto==1.2.0; python_full_version >= '3.7.0'
diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
new file mode 100644
index 0000000..90cd562
--- /dev/null
+++ b/scrapers/scrapers.py
@@ -0,0 +1,45 @@
+import re
+
+import typer
+from src import craigslist as cl
+from src import database as db
+from src import facebook as fb
+from src import utils
+
+app = typer.Typer()
+
+craigslistScraperVersion = 1
+facebookScraperVersion = 1
+
+
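+# "event" and "context" mirror the AWS Lambda handler signature so these
+# commands can also run as Lambda entry points (see the Dockerfile CMD).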
+@app.command()
+def craigslist(event, context):
+ utils.scrape("craigslist", craigslistScraperVersion)
+
+
+@app.command()
+def facebook(event, context):
+ utils.scrape("facebook", facebookScraperVersion)
+
+
+@app.command()
+def link(link: str):
+ clPattern = re.compile(
+ r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$"
+ )
+ fbPattern = re.compile(
+ r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$"
+ )
+
+ if clPattern.match(link):
+ newInfo = cl.scrapeListing(link)
+ db.update(link, newInfo)
+ elif fbPattern.match(link):
+ newInfo = fb.scrapeListing(link)
+ print(newInfo)
+ else:
+ print("Not a Craigslist nor a Facebook Marketplace link")
+
+
+if __name__ == "__main__":
+ app()
diff --git a/scrapers/src/__init__.py b/scrapers/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py
new file mode 100644
index 0000000..13e30b9
--- /dev/null
+++ b/scrapers/src/craigslist.py
@@ -0,0 +1,148 @@
+import time
+
+from bs4 import BeautifulSoup
+
+from . import utils
+
+
+def loadPageResources(driver):
+ scroll = 100
+
+ print("Waiting to load...")
+ time.sleep(2)
+
+ utils.scrollTo(scroll, driver)
+
+ loadImgButtons = driver.find_elements("class name", "slider-back-arrow")
+
+ time.sleep(2)
+
+ # Emulate a user scrolling
+ for i in range(len(loadImgButtons)):
+ scroll += 100
+ utils.scrollTo(scroll, driver)
+
+ utils.clickOn(loadImgButtons[i], driver)
+
+ time.sleep(0.5)
+
+
+def setupURLs(oldestAllowedCars):
+ # List of TX cities to scrape; can be expanded
+ cities = [
+ "abilene",
+ "amarillo",
+ "austin",
+ "beaumont",
+ "brownsville",
+ "collegestation",
+ "corpuschristi",
+ "dallas",
+ "nacogdoches",
+ "delrio",
+ "elpaso",
+ "galveston",
+ "houston",
+ "killeen",
+ "laredo",
+ "lubbock",
+ "mcallen",
+ "odessa",
+ "sanangelo",
+ "sanantonio",
+ "sanmarcos",
+ "bigbend",
+ "texoma",
+ "easttexas",
+ "victoriatx",
+ "waco",
+ "wichitafalls",
+ ]
+
+    # Base URL of each city's Craigslist cars & trucks search (gallery view)
+ base_url = (
+ "https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0"
+ )
+ return [base_url.format(city, oldestAllowedCars) for city in cities]
+
+
+def getAllPosts(browser):
+ # Create a BeautifulSoup object from the HTML of the page
+ html = browser.page_source
+ soup = BeautifulSoup(html, "html.parser")
+
+ # Find all of the car listings on the page
+ return soup.find_all("div", class_="gallery-card")
+
+
+def getCarInfo(post):
+ title = post.find("span", class_="label").text
+
+ print(f'Scraping "{title}"')
+
+ price = post.find("span", class_="priceinfo").text
+    metadata = post.find("div", class_="meta").text.split("·")
+
+    odometer = metadata[1]
+    location = metadata[2] if len(metadata) >= 3 else None
+
+ link = post.find("a", class_="posting-title", href=True)["href"]
+
+ imageElements = post.findAll("img")
+ images = [img["src"] for img in imageElements]
+
+ return title, price, location, odometer, link, images
+
+
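+# Craigslist attribute groups are "label: value" strings; labels are normalized
+# to lowercase-with-hyphens so they can be used directly as document keys.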
+def processAttributes(attributes):
+ processedAttributes = []
+
+ for attr in attributes:
+ [label, value] = attr.split(": ")
+ processedAttributes.append(
+ {"label": label.replace(" ", "-").lower(), "value": value}
+ )
+
+ return processedAttributes
+
+
+def scrapeListing(url):
+ browser = utils.setupBrowser()
+
+ # Navigate to the URL
+ print(f"Going to {url}")
+ browser.get(url)
+
+ print(f"Loading page for {url}")
+ time.sleep(1)
+
+ # Create a BeautifulSoup object from the HTML of the page
+ html = browser.page_source
+ soup = BeautifulSoup(html, "html.parser")
+
+ try:
+ description = soup.find("section", id="postingbody").text
+ attributes = processAttributes(
+ [
+ attr.text
+ for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span")
+ ]
+ )
+ map = soup.find("div", id="map")
+
+ car = {
+ "postBody": description,
+ "longitude": map["data-longitude"],
+ "latitude": map["data-latitude"],
+ }
+
+ for attr in attributes:
+ car[attr["label"]] = attr["value"]
+
+ return car
+    except Exception as e:
+        print(f"Failed scraping {url}: \n{e}")
+    finally:
+        # Close the Selenium WebDriver instance whether or not scraping succeeded
+        browser.quit()
diff --git a/scrapers/src/database.py b/scrapers/src/database.py
new file mode 100644
index 0000000..1c88567
--- /dev/null
+++ b/scrapers/src/database.py
@@ -0,0 +1,89 @@
+import os
+from datetime import date
+
+import pymongo
+from dotenv import load_dotenv
+
+db = "scrape"
+collection = "scraped_raw"
+
+
+def get_conn(db):
+ # load environment variable containing db uri (which includes username and password)
+ load_dotenv()
+ db_uri = os.environ.get("DB_URI")
+
+ # create a mongodb connection
+ try:
+ client = pymongo.MongoClient(db_uri)
+
+ # return a friendly error if a URI error is thrown
+ except pymongo.errors.ConfigurationError:
+ print(
+ "An Invalid URI host error was received."
+ " Is your Atlas host name correct in your connection string (found the .env)?"
+ )
+ return {"success": False, "db": 0}
+
+ return {"success": True, "db": client.get_database(db)}
+
+
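+# The listing link doubles as the Mongo "_id", so inserting the same listing
+# twice raises a duplicate-key error instead of creating a second document.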
+def post_raw(
+ scraperVersion,
+ source,
+ title,
+ price,
+ location,
+ miles,
+ link,
+ images=None,
+ postBody=None,
+ longitude=None,
+ latitude=None,
+ attributes=None,
+):
+ car = {
+ "_id": link,
+ "source": source,
+ "scraper-version": scraperVersion,
+ "scrape-date": str(date.today()),
+ "title": title,
+ "price": price,
+ "location": location,
+ "odometer": miles,
+ "link": link,
+ }
+
+ if images is not None:
+ car["images"] = images
+
+ if postBody is not None:
+ car["postBody"] = postBody
+
+ if longitude is not None:
+ car["longitude"] = longitude
+
+ if latitude is not None:
+ car["latitude"] = latitude
+
+ if attributes is not None:
+ for attr in attributes:
+ car[attr["label"]] = attr["value"]
+
+    # Insert into the "scraped_raw" collection
+ conn = get_conn(db)
+
+ if conn["success"]:
+ result = conn["db"][collection].insert_one(car)
+ return result.acknowledged
+ else:
+ return False
+
+
+def update(link, newFields):
+ conn = get_conn(db)
+ if conn["success"]:
+ result = conn["db"][collection].update_one({"_id": link}, {"$set": newFields})
+ return result.acknowledged
+ else:
+ return False
diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py
new file mode 100644
index 0000000..61aef28
--- /dev/null
+++ b/scrapers/src/facebook.py
@@ -0,0 +1,156 @@
+import time
+
+from bs4 import BeautifulSoup
+
+from . import utils
+
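+# Facebook Marketplace markup uses generated utility class names, so these
+# selectors are brittle and need refreshing whenever Facebook changes its markup.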
+postClass = (
+ "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4"
+ " x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24"
+)
+linkClass = (
+ "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619"
+ "x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r"
+ " xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq"
+ " x1a2a7pz x1heor9g x1lku1pv"
+)
+thumbnailClass = "xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3"
+titleClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6"
+priceClass = (
+ "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel"
+ " x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i"
+ " x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7"
+ " x1s688f xzsf02u"
+)
+metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft"
+
+listingInfoClass = "x78zum5 xdt5ytf x1iyjqo2 x1n2onr6"
+listingSectionClass = "xod5an3"
+bodyClass = (
+ "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx"
+ " x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty"
+ " x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u"
+)
+
+
+def loadPageResources(driver):
+ scroll = 100
+
+ print("Waiting to load...")
+ time.sleep(2)
+ utils.scrollTo(scroll, driver)
+ time.sleep(1.5)
+
+ # Emulate a user scrolling
+ for i in range(10):
+ scroll += 1000
+ utils.scrollTo(scroll, driver)
+ time.sleep(1)
+
+
+def setupURLs(oldestAllowedCars):
+ # List of TX cities to scrape; can be expanded
+ cities = ["houston", "dallas", "austin", "fortworth", "elpaso", "sanantonio"]
+
+ # Set the URL of the Facebook Marketplace automotive category
+ base_url = "https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false"
+ return [base_url.format(city, oldestAllowedCars) for city in cities]
+
+
+def getAllPosts(browser):
+ # Create a BeautifulSoup object from the HTML of the page
+ html = browser.page_source
+ soup = BeautifulSoup(html, "html.parser")
+
+ # Find all of the car listings on the page
+ return soup.find_all("div", class_=postClass)
+
+
+def getCarInfo(post):
+ title = post.find("span", class_=titleClass).text
+
+ print(f'Scraping "{title}"')
+
+ price = post.find("span", class_=priceClass).text
+ metadata = post.findAll("span", class_=metaClass)
+
+ location = metadata[0].text
+ odometer = metadata[1].text
+
+ link = post.find("a", class_=linkClass, href=True)["href"]
+ link = "https://facebook.com" + link
+
+ thumbnail = post.find("img", class_=thumbnailClass)["src"]
+
+ return title, price, location, odometer, link, [thumbnail]
+
+
+def getCarImages():
+ # class="x1a0syf3 x1ja2u2z"
+ return "TODO"
+
+
+def processAttributes(attributes):
+ processedAttributes = []
+
+ for attr in attributes:
+ [label, value] = attr.split(": ")
+ processedAttributes.append({"label": label, "value": value})
+
+ return processedAttributes
+
+
+def scrapeListing(url):
+ browser = utils.setupBrowser()
+
+ # Navigate to the URL
+ print(f"Going to {url[0:60]}")
+ browser.get(url[0:60])
+
+ print(f"Loading page for {url[0:60]}")
+ time.sleep(1)
+
+ # Create a BeautifulSoup object from the HTML of the page
+ html = browser.page_source
+ soup = BeautifulSoup(html, "html.parser")
+
+ try:
+ seeMoreButton = browser.find_element(
+ "class name",
+ "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace(
+ " ", "."
+ ),
+ )
+ utils.clickOn(seeMoreButton, browser)
+
+ listingInfo = soup.find("div", class_=listingInfoClass)
+ # description = listingInfo.find(
+ # "span",
+ # class_=(
+ # "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq"
+ # " x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m"
+ # " x6prxxf xvq8zen xo1l8bm xzsf02u"
+ # ),
+ # )
+ print(listingInfo)
+
+ return 2
+
+ # attributes = processAttributes(
+ # [
+ # attr.text
+ # for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span")
+ # ]
+ # )
+
+ # map = soup.find('div', id='map')
+ # longitude = map["data-longitude"]
+ # latitude = map["data-latitude"]
+
+ # print([attributes, description, longitude, latitude])
+    except Exception as error:
+        print(error)
+        return -1
+    finally:
+        # Close the Selenium WebDriver instance whether or not scraping succeeded
+        browser.quit()
diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py
new file mode 100644
index 0000000..399e64e
--- /dev/null
+++ b/scrapers/src/utils.py
@@ -0,0 +1,85 @@
+from selenium import webdriver
+
+from . import craigslist
+from . import database as db
+from . import facebook
+
+
+def scrollTo(x, driver):
+ driver.execute_script(
+ f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})"
+ )
+
+
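+# Click via JavaScript so elements that are off-screen or covered by overlays
+# (e.g. Craigslist's slider arrows) can still be triggered.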
+def clickOn(elem, driver):
+ driver.execute_script("arguments[0].click();", elem)
+
+
+def createDriverOptions():
+ options = webdriver.ChromeOptions()
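+    # Point Selenium at the Chrome binary unpacked into /opt/chrome by the Dockerfile build stage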
+ options.binary_location = "/opt/chrome/chrome"
+
+ options.add_argument("--headless=new")
+ options.add_argument("--headless=new")
+ options.add_argument("--no-sandbox")
+ options.add_argument("--disable-gpu")
+ options.add_argument("--window-size=1280x1696")
+ options.add_argument("--single-process")
+ options.add_argument("--disable-dev-shm-usage")
+ options.add_argument("--disable-dev-tools")
+ options.add_argument("--no-zygote")
+
+ return options
+
+
+def setupBrowser():
+ print("Setting up headless browser")
+
+ service = webdriver.ChromeService("/opt/chromedriver")
+ options = createDriverOptions()
+
+ print("Creating a new Selenium WebDriver instance")
+ return webdriver.Chrome(options=options, service=service)
+
+
+def scrape(website, scraperVersion):
+    if website == "craigslist":
+        scraper = craigslist
+    elif website == "facebook":
+        scraper = facebook
+    else:
+        print(f"Unsupported website '{website}'; expected 'craigslist' or 'facebook'")
+        return
+
+ cityURLs = scraper.setupURLs(2011)
+ browser = setupBrowser()
+
+ for url in cityURLs:
+ print(f"Going to {url}")
+ browser.get(url)
+
+ print(f"Loading cars from {url}")
+ scraper.loadPageResources(browser)
+
+ carPosts = scraper.getAllPosts(browser)
+
+ for post in carPosts:
+ try:
+ title, price, location, odometer, link, images = scraper.getCarInfo(
+ post
+ )
+ success = db.post_raw(
+ scraperVersion,
+ website,
+ title,
+ price,
+ location,
+ odometer,
+ link,
+ images,
+ )
+ if success:
+ print("posted to db")
+ else:
+ print("failed to post to db")
+ except Exception as error:
+ print(error)
+
+ browser.quit()
diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist.py
deleted file mode 100644
index cb6ad22..0000000
--- a/src/scrapers/craigslist.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from datetime import datetime
-import csv
-import json
-import requests
-
-location_to_batch = {
- "newyork": "3-0-360-0-0",
- "philadelphia": "17-0-360-0-0",
- "dallas": "21-0-360-0-0",
- # Add more locations and their batch values as needed
-}
-
-def fetch_job_postings(location, category):
- base_url = "https://sapi.craigslist.org/web/v8/postings/search/full"
-
- # Get the batch value and category abbreviation from the mappings
- # Default to New York if location not found
- batch = location_to_batch.get(location)
-
- params = {
- 'batch': batch,
- 'cc': 'US',
- 'lang': 'en',
- 'searchPath': "cta",
- "id": "0",
- "collectContactInfo": True,
- }
-
- headers = {
- 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
- 'Referer': f'https://{location}.craigslist.org/',
- 'sec-ch-ua-mobile': '?0',
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
- 'sec-ch-ua-platform': '"Windows"',
- 'Cookie': f'cl_b=COOKIE VALUE'
- }
-
- response = requests.get(base_url, params=params, headers=headers)
-
- if response.status_code == 200:
- data = response.json()
- else:
- print("Failed to retrieve data. Status code:", response.status_code)
- data = None
-
- job_postings = []
- with open('file.txt', 'w') as f:
- json.dump(data, f, indent=2)
-
- if data:
- for item in data["data"]["items"]:
- job_title = None
- commission = None
- for element in item:
- if isinstance(element, str):
- job_title = element
- elif isinstance(element, list) and len(element) > 0 and element[0] == 7:
- commission = element[1]
- if job_title and commission:
- job_postings.append((job_title, commission))
- return job_postings
-
- else:
- print("No data available.")
-
-if __name__ == "__main__":
- location = "dallas"
- category = "cta"
-
- job_postings = fetch_job_postings(location, category)
-
- if job_postings:
- current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
- category = category.replace("/", "&")
- csv_filename = f"{location}_{category}_openings_{current_datetime}.csv"
-
- with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
- writer = csv.writer(file)
-
- writer.writerow(["Job Title", "Commission"])
- for job in job_postings:
- writer.writerow([job[0], job[1]])
-
- print(f"Job postings have been saved to {csv_filename}")
- else:
- print("No data available.")
\ No newline at end of file
diff --git a/src/scrapers/database.py b/src/scrapers/database.py
deleted file mode 100644
index 49c463c..0000000
--- a/src/scrapers/database.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from dotenv import load_dotenv
-import pymongo
-import os
-from datetime import date
-
-def get_conn(db):
- # load environment variable containing db uri (which includes username and password)
- load_dotenv()
- db_uri = os.getenv("DB_URI")
-
- # create a mongodb connection
- try:
- client = pymongo.MongoClient(db_uri)
-
- # return a friendly error if a URI error is thrown
- except pymongo.errors.ConfigurationError:
- print("An Invalid URI host error was received. Is your Atlas host name correct in your connection string (found the .env)?")
- return {"success" : False, "db": 0}
-
- # use a database named "test"
- return {"success" : True, "db": client.get_database(db)}
-
-def post_raw(source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None):
- car = {
- "title": title,
- "price": price,
- "location": location,
- "odometer": miles,
- "link": link,
- "source": source,
- "scrapeDate": str(date.today())
- }
-
- if (images is not None):
- car["images"] = images
-
- if (postBody is not None):
- car["postBody"] = postBody
-
- if (longitude is not None):
- car["longitude"] = longitude
-
- if (latitude is not None):
- car["latitude"] = latitude
-
- if (attributes is not None):
- for attr in attributes:
- car[attr["label"]] = attr["value"]
-
- # Insert into collection called "scrape_test"
- conn = get_conn("scrape")
-
- if (conn["success"]):
- result = conn["db"]["scraped_raw"].insert_one(car)
- return result.acknowledged
- else:
- return False
\ No newline at end of file
diff --git a/src/scrapers/facebook.py b/src/scrapers/facebook.py
deleted file mode 100644
index 00b5e45..0000000
--- a/src/scrapers/facebook.py
+++ /dev/null
@@ -1,111 +0,0 @@
-from selenium import webdriver
-from bs4 import BeautifulSoup
-from selenium.webdriver.chrome.options import Options
-import time
-
-import database
-
-
-#list of cities to scrape; can be expanded
-cities = [
- 'nyc', 'la', 'chicago', 'houston', 'miami',
- 'philadelphia', 'phoenix', 'sanantonio', 'sandiego', 'dallas',
- 'sanjose', 'austin', 'jacksonville', 'fortworth', 'columbus',
- 'charlotte', 'sanfrancisco', 'indianapolis', 'seattle', 'denver',
- 'washington', 'boston', 'elpaso', 'nashville', 'detroit', 'portland', 'lasvegas', 'memphis', 'louisville',
- 'baltimore', 'milwaukee', 'albuquerque', 'tucson', 'fresno',
- 'kansascity', 'mesa', 'atlanta',
- 'coloradosprings', 'virginiabeach', 'raleigh', 'omaha', 'miami',
- 'oakland', 'minneapolis', 'tulsa', 'wichita', 'neworleans'
-]
-
-# Set the URL of the Facebook Marketplace automotive category
-base_url = 'https://www.facebook.com/marketplace/{}/vehicles'
-urls = [base_url.format(city) for city in cities]
-
-# Create a new Selenium WebDriver instance
-
-print("Setting up headless browser")
-options = Options()
-options.add_argument("--headless=new")
-
-print("Creating a new Selenium WebDriver instance")
-driver = webdriver.Chrome(options=options)
-
-# Create a list to store the scraped data
-print("Started scraping...")
-data = {}
-for url in urls:
- # Navigate to the URL
- print(f"Navigating to {url}")
- driver.get(url)
-
- print(f"Loading {url}")
-
- time.sleep(2)
- scroll = 2000
-
- # Wait for the page to load
- time.sleep(2)
-
- for i in range(50):
- driver.execute_script(f"window.scrollTo(1, {scroll})")
- scroll += 1000
- time.sleep(.5)
-
- # Get the HTML of the page
- html = driver.page_source
-
- # Create a BeautifulSoup object from the HTML
- soup = BeautifulSoup(html, 'html.parser')
-
- # Find all of the automotive listings on the page
- car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')
-
- # Iterate over the listings and scrape the data
- for post in car_posts:
- print("Scraping new listing")
- try:
- # Get the title of the listing
- title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
- except AttributeError:
- title = 'N/A' # Handle missing title
-
- try:
- # Get the price of the listing
- price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
- except AttributeError:
- price = 'N/A' # Handle missing price
-
- try:
- # Get the location of the listing
- location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
- except AttributeError:
- location = 'N/A' # Handle missing location
-
- try:
- # Get the miles of the car
- miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
- except (AttributeError, IndexError):
- miles = 'N/A' # Handle missing miles
-
- try:
- # Get the link to the listing
- link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
- except (AttributeError, TypeError):
- link = 'N/A' # Handle missing link
-
- # Add the data to the list
- if (title, price, location, miles, link) not in data:
- data[(title, price, location, miles, link)] = True
- postSuccess = database.post_raw("facebook", title, price, location, miles, link)
- if (postSuccess):
- print("Save to DB")
- else:
- print("Failed to save to DB")
- else:
- print("Listing is a duplicate")
-
-
-# Close the Selenium WebDriver instance
-driver.quit()
\ No newline at end of file