diff --git a/.gitignore b/.gitignore
index 4763acb..08eebad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,4 +130,7 @@ dist
 .pnp.*
 
 # misc
-*.DS_STORE
\ No newline at end of file
+*.DS_STORE
+
+# python
+*.pyc
\ No newline at end of file
diff --git a/README.md b/README.md
index 2454dd3..2001e46 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,36 @@ Make a copy of the ``.env.example`` file and make the following changes.
 2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
 
 3. Paste the connection URL provided provided in MongoDB Atlas. Include the password and username fields using ``${VARIABLE}`` syntax to embed the value of the variable
+
+## Run Scrapers locally
+**Prerequisites**
+- python3
+- pipenv
+
+**Installing dependencies**
+Navigate to ``scrapers/`` and activate the virtual environment using
+```bash
+pipenv shell
+```
+Then install dependencies using
+```bash
+pipenv install
+```
+
+**Scraper Usage**
+To build a Docker image, use
+```bash
+pipenv run build
+```
+To run a Docker container "smarecontainer", use
+```bash
+pipenv run cont
+```
+Then run a scraper:
+```bash
+# Scrape Craigslist homepage
+pipenv run craigslist
+
+# Scrape Facebook Marketplace homepage
+pipenv run facebook
+```
diff --git a/.env.example b/scrapers/.env.example
similarity index 100%
rename from .env.example
rename to scrapers/.env.example
diff --git a/scrapers/.flake8 b/scrapers/.flake8
new file mode 100644
index 0000000..79a16af
--- /dev/null
+++ b/scrapers/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 120
\ No newline at end of file
diff --git a/scrapers/Dockerfile b/scrapers/Dockerfile
new file mode 100644
index 0000000..844d302
--- /dev/null
+++ b/scrapers/Dockerfile
@@ -0,0 +1,25 @@
+FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733 as build
+RUN yum install -y unzip-* && \
+    curl -Lo "/tmp/chromedriver-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip" && \
+    curl -Lo "/tmp/chrome-linux64.zip" "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chrome-linux64.zip" && \
+    unzip /tmp/chromedriver-linux64.zip -d /opt/ && \
+    unzip /tmp/chrome-linux64.zip -d /opt/ && \
+    yum clean all
+
+FROM public.ecr.aws/lambda/python@sha256:f0c3116a56d167eba8021a5d7c595f969835fbe78826303326f80de00d044733
+RUN yum install atk-* cups-libs-* gtk3-* libXcomposite-* alsa-lib-* \
+    libXcursor-* libXdamage-* libXext-* libXi-* libXrandr-* libXScrnSaver-* \
+    libXtst-* pango-* at-spi2-atk-* libXt-* xorg-x11-server-Xvfb-* \
+    xorg-x11-xauth-* dbus-glib-* dbus-glib-devel-* -y && \
+    yum clean all
+COPY --from=build /opt/chrome-linux64 /opt/chrome
+COPY --from=build /opt/chromedriver-linux64 /opt/
+
+WORKDIR /var/task
+COPY scrapers.py ./
+COPY src ./src
+COPY requirements.txt ./
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+CMD [ "scrapers.craigslist" ]
\ No newline at end of file
diff --git a/scrapers/Pipfile b/scrapers/Pipfile
new file mode 100644
index 0000000..0ecb354
--- /dev/null
+++ b/scrapers/Pipfile
@@ -0,0 +1,26 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[scripts]
+build = "docker build --platform linux/amd64 -t smare ."
+cont = "docker run --name smarecontainer -d smare:latest" +exec = "docker exec -it smarecontainer" +craigslist = "pipenv run exec python3 -c 'import scrapers; scrapers.craigslist(\"\",\"\")'" +facebook = "pipenv run exec python3 -c 'import scrapers; scrapers.facebook(\"\",\"\")'" + +[packages] +selenium = "*" +bs4 = "*" +pymongo = "*" +typer = "*" +python-dotenv = "*" + +[dev-packages] +isort = "*" +black = "*" +flake8 = "*" + +[requires] +python_version = "3.11" diff --git a/scrapers/Pipfile.lock b/scrapers/Pipfile.lock new file mode 100644 index 0000000..bb2797e --- /dev/null +++ b/scrapers/Pipfile.lock @@ -0,0 +1,389 @@ +{ + "_meta": { + "hash": { + "sha256": "716098b2b29f4b98c932bd4554e3953a184fea6603a7d3f17e7bd47179932031" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.11" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "attrs": { + "hashes": [ + "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04", + "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015" + ], + "markers": "python_version >= '3.7'", + "version": "==23.1.0" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da", + "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==4.12.2" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" + }, + "certifi": { + "hashes": [ + "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", + "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474" + ], + "markers": "python_version >= '3.6'", + "version": "==2023.11.17" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "dnspython": { + "hashes": [ + "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8", + "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984" + ], + "markers": "python_version >= '3.8' and python_version < '4.0'", + "version": "==2.4.2" + }, + "h11": { + "hashes": [ + "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", + "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761" + ], + "markers": "python_version >= '3.7'", + "version": "==0.14.0" + }, + "idna": { + "hashes": [ + "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca", + "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f" + ], + "markers": "python_version >= '3.5'", + "version": "==3.6" + }, + "outcome": { + "hashes": [ + "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", + "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.0.post0" + }, + "pymongo": { + "hashes": [ + "sha256:00c199e1c593e2c8b033136d7a08f0c376452bac8a896c923fcd6f419e07bdd2", + "sha256:010bc9aa90fd06e5cc52c8fac2c2fd4ef1b5f990d9638548dde178005770a5e8", + "sha256:026a24a36394dc8930cbcb1d19d5eb35205ef3c838a7e619e04bd170713972e7", + "sha256:061598cbc6abe2f382ab64c9caa83faa2f4c51256f732cdd890bcc6e63bfb67e", + 
"sha256:13552ca505366df74e3e2f0a4f27c363928f3dff0eef9f281eb81af7f29bc3c5", + "sha256:13d613c866f9f07d51180f9a7da54ef491d130f169e999c27e7633abe8619ec9", + "sha256:144a31391a39a390efce0c5ebcaf4bf112114af4384c90163f402cec5ede476b", + "sha256:1461199b07903fc1424709efafe379205bf5f738144b1a50a08b0396357b5abf", + "sha256:154b361dcb358ad377d5d40df41ee35f1cc14c8691b50511547c12404f89b5cb", + "sha256:1c5654bb8bb2bdb10e7a0bc3c193dd8b49a960b9eebc4381ff5a2043f4c3c441", + "sha256:1de3c6faf948f3edd4e738abdb4b76572b4f4fdfc1fed4dad02427e70c5a6219", + "sha256:1ed23b0e2dac6f84f44c8494fbceefe6eb5c35db5c1099f56ab78fc0d94ab3af", + "sha256:1f2b856518bfcfa316c8dae3d7b412aecacf2e8ba30b149f5eb3b63128d703b9", + "sha256:2346450a075625c4d6166b40a013b605a38b6b6168ce2232b192a37fb200d588", + "sha256:262356ea5fcb13d35fb2ab6009d3927bafb9504ef02339338634fffd8a9f1ae4", + "sha256:27b81ecf18031998ad7db53b960d1347f8f29e8b7cb5ea7b4394726468e4295e", + "sha256:2940aa20e9cc328e8ddeacea8b9a6f5ddafe0b087fedad928912e787c65b4909", + "sha256:2d4ccac3053b84a09251da8f5350bb684cbbf8c8c01eda6b5418417d0a8ab198", + "sha256:2dd2f6960ee3c9360bed7fb3c678be0ca2d00f877068556785ec2eb6b73d2414", + "sha256:3071ec998cc3d7b4944377e5f1217c2c44b811fae16f9a495c7a1ce9b42fb038", + "sha256:3094c7d2f820eecabadae76bfec02669567bbdd1730eabce10a5764778564f7b", + "sha256:30b2c9caf3e55c2e323565d1f3b7e7881ab87db16997dc0cbca7c52885ed2347", + "sha256:3177f783ae7e08aaf7b2802e0df4e4b13903520e8380915e6337cdc7a6ff01d8", + "sha256:31dab1f3e1d0cdd57e8df01b645f52d43cc1b653ed3afd535d2891f4fc4f9712", + "sha256:33bb16a07d3cc4e0aea37b242097cd5f7a156312012455c2fa8ca396953b11c4", + "sha256:349093675a2d3759e4fb42b596afffa2b2518c890492563d7905fac503b20daa", + "sha256:39d77d8bbb392fa443831e6d4ae534237b1f4eee6aa186f0cdb4e334ba89536e", + "sha256:3a7f02a58a0c2912734105e05dedbee4f7507e6f1bd132ebad520be0b11d46fd", + "sha256:3b287e814a01deddb59b88549c1e0c87cefacd798d4afc0c8bd6042d1c3d48aa", + "sha256:3c74f4725485f0a7a3862cfd374cc1b740cebe4c133e0c1425984bcdcce0f4bb", + "sha256:3cadf7f4c8e94d8a77874b54a63c80af01f4d48c4b669c8b6867f86a07ba994f", + "sha256:3d18a9b9b858ee140c15c5bfcb3e66e47e2a70a03272c2e72adda2482f76a6ad", + "sha256:3f0e6a6c807fa887a0c51cc24fe7ea51bb9e496fe88f00d7930063372c3664c3", + "sha256:4344c30025210b9fa80ec257b0e0aab5aa1d5cca91daa70d82ab97b482cc038e", + "sha256:4497d49d785482cc1a44a0ddf8830b036a468c088e72a05217f5b60a9e025012", + "sha256:547dc5d7f834b1deefda51aedb11a7af9c51c45e689e44e14aa85d44147c7657", + "sha256:5556e306713e2522e460287615d26c0af0fe5ed9d4f431dad35c6624c5d277e9", + "sha256:55dac73316e7e8c2616ba2e6f62b750918e9e0ae0b2053699d66ca27a7790105", + "sha256:56816e43c92c2fa8c11dc2a686f0ca248bea7902f4a067fa6cbc77853b0f041e", + "sha256:5bd94c503271e79917b27c6e77f7c5474da6930b3fb9e70a12e68c2dff386b9a", + "sha256:5ec31adc2e988fd7db3ab509954791bbc5a452a03c85e45b804b4bfc31fa221d", + "sha256:69247f7a2835fc0984bbf0892e6022e9a36aec70e187fcfe6cae6a373eb8c4de", + "sha256:6a0ae7a48a6ef82ceb98a366948874834b86c84e288dbd55600c1abfc3ac1d88", + "sha256:6a1810c2cbde714decf40f811d1edc0dae45506eb37298fd9d4247b8801509fe", + "sha256:76013fef1c9cd1cd00d55efde516c154aa169f2bf059b197c263a255ba8a9ddf", + "sha256:77e0df59b1a4994ad30c6d746992ae887f9756a43fc25dec2db515d94cf0222d", + "sha256:7bb0e9049e81def6829d09558ad12d16d0454c26cabe6efc3658e544460688d9", + "sha256:88beb444fb438385e53dc9110852910ec2a22f0eab7dd489e827038fdc19ed8d", + "sha256:8b47ebd89e69fbf33d1c2df79759d7162fc80c7652dacfec136dae1c9b3afac7", + "sha256:8d219b4508f71d762368caec1fc180960569766049bbc4d38174f05e8ef2fe5b", + 
"sha256:8ec75f35f62571a43e31e7bd11749d974c1b5cd5ea4a8388725d579263c0fdf6", + "sha256:9167e735379ec43d8eafa3fd675bfbb12e2c0464f98960586e9447d2cf2c7a83", + "sha256:9a710c184ba845afb05a6f876edac8f27783ba70e52d5eaf939f121fc13b2f59", + "sha256:9aafd036f6f2e5ad109aec92f8dbfcbe76cff16bad683eb6dd18013739c0b3ae", + "sha256:9c79d597fb3a7c93d7c26924db7497eba06d58f88f58e586aa69b2ad89fee0f8", + "sha256:a2831e05ce0a4df10c4ac5399ef50b9a621f90894c2a4d2945dc5658765514ed", + "sha256:a5e641f931c5cd95b376fd3c59db52770e17bec2bf86ef16cc83b3906c054845", + "sha256:b10d8cda9fc2fcdcfa4a000aa10413a2bf8b575852cd07cb8a595ed09689ca98", + "sha256:b435b13bb8e36be11b75f7384a34eefe487fe87a6267172964628e2b14ecf0a7", + "sha256:b7b1a83ce514700276a46af3d9e481ec381f05b64939effc9065afe18456a6b9", + "sha256:b8729dbf25eb32ad0dc0b9bd5e6a0d0b7e5c2dc8ec06ad171088e1896b522a74", + "sha256:bbed8cccebe1169d45cedf00461b2842652d476d2897fd1c42cf41b635d88746", + "sha256:c258dbacfff1224f13576147df16ce3c02024a0d792fd0323ac01bed5d3c545d", + "sha256:c30a9e06041fbd7a7590693ec5e407aa8737ad91912a1e70176aff92e5c99d20", + "sha256:c91ea3915425bd4111cb1b74511cdc56d1d16a683a48bf2a5a96b6a6c0f297f7", + "sha256:d0355cff58a4ed6d5e5f6b9c3693f52de0784aa0c17119394e2a8e376ce489d4", + "sha256:d483793a384c550c2d12cb794ede294d303b42beff75f3b3081f57196660edaf", + "sha256:d4c2be9760b112b1caf649b4977b81b69893d75aa86caf4f0f398447be871f3c", + "sha256:d8e62d06e90f60ea2a3d463ae51401475568b995bafaffd81767d208d84d7bb1", + "sha256:da08ea09eefa6b960c2dd9a68ec47949235485c623621eb1d6c02b46765322ac", + "sha256:dd1fa413f8b9ba30140de198e4f408ffbba6396864c7554e0867aa7363eb58b2", + "sha256:e2aced6fb2f5261b47d267cb40060b73b6527e64afe54f6497844c9affed5fd0", + "sha256:e438417ce1dc5b758742e12661d800482200b042d03512a8f31f6aaa9137ad40", + "sha256:e470fa4bace5f50076c32f4b3cc182b31303b4fefb9b87f990144515d572820b", + "sha256:eaf2f65190c506def2581219572b9c70b8250615dc918b3b7c218361a51ec42e", + "sha256:ef102a67ede70e1721fe27f75073b5314911dbb9bc27cde0a1c402a11531e7bd", + "sha256:ef801027629c5b511cf2ba13b9be29bfee36ae834b2d95d9877818479cdc99ea", + "sha256:f7acc03a4f1154ba2643edeb13658d08598fe6e490c3dd96a241b94f09801626", + "sha256:f9756f1d25454ba6a3c2f1ef8b7ddec23e5cdeae3dc3c3377243ae37a383db00", + "sha256:ff62ba8ff70f01ab4fe0ae36b2cb0b5d1f42e73dfc81ddf0758cd9f77331ad25", + "sha256:ff925f1cca42e933376d09ddc254598f8c5fcd36efc5cac0118bb36c36217c41" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==4.6.1" + }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "version": "==1.7.1" + }, + "python-dotenv": { + "hashes": [ + "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba", + "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==1.0.0" + }, + "selenium": { + "hashes": [ + "sha256:22eab5a1724c73d51b240a69ca702997b717eee4ba1f6065bf5d6b44dba01d48", + "sha256:9e82cd1ac647fb73cf0d4a6e280284102aaa3c9d94f0fa6e6cc4b5db6a30afbf" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==4.15.2" + }, + "sniffio": { + "hashes": [ + "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101", + "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384" + ], + "markers": "python_version >= '3.7'", + 
"version": "==1.3.0" + }, + "sortedcontainers": { + "hashes": [ + "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", + "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0" + ], + "version": "==2.4.0" + }, + "soupsieve": { + "hashes": [ + "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690", + "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7" + ], + "markers": "python_version >= '3.8'", + "version": "==2.5" + }, + "trio": { + "hashes": [ + "sha256:16f89f7dcc8f7b9dcdec1fcd863e0c039af6d0f9a22f8dfd56f75d75ec73fd48", + "sha256:bb4abb3f4af23f96679e7c8cdabb8b234520f2498550d2cf63ebfd95f2ce27fe" + ], + "markers": "python_version >= '3.8'", + "version": "==0.23.1" + }, + "trio-websocket": { + "hashes": [ + "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f", + "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638" + ], + "markers": "python_version >= '3.7'", + "version": "==0.11.1" + }, + "typer": { + "hashes": [ + "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2", + "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==0.9.0" + }, + "typing-extensions": { + "hashes": [ + "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", + "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" + ], + "markers": "python_version >= '3.8'", + "version": "==4.8.0" + }, + "urllib3": { + "extras": [ + "socks" + ], + "hashes": [ + "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3", + "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54" + ], + "markers": "python_version >= '3.8'", + "version": "==2.1.0" + }, + "wsproto": { + "hashes": [ + "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065", + "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==1.2.0" + } + }, + "develop": { + "black": { + "hashes": [ + "sha256:250d7e60f323fcfc8ea6c800d5eba12f7967400eb6c2d21ae85ad31c204fb1f4", + "sha256:2a9acad1451632021ee0d146c8765782a0c3846e0e0ea46659d7c4f89d9b212b", + "sha256:412f56bab20ac85927f3a959230331de5614aecda1ede14b373083f62ec24e6f", + "sha256:421f3e44aa67138ab1b9bfbc22ee3780b22fa5b291e4db8ab7eee95200726b07", + "sha256:45aa1d4675964946e53ab81aeec7a37613c1cb71647b5394779e6efb79d6d187", + "sha256:4c44b7211a3a0570cc097e81135faa5f261264f4dfaa22bd5ee2875a4e773bd6", + "sha256:4c68855825ff432d197229846f971bc4d6666ce90492e5b02013bcaca4d9ab05", + "sha256:5133f5507007ba08d8b7b263c7aa0f931af5ba88a29beacc4b2dc23fcefe9c06", + "sha256:54caaa703227c6e0c87b76326d0862184729a69b73d3b7305b6288e1d830067e", + "sha256:58e5f4d08a205b11800332920e285bd25e1a75c54953e05502052738fe16b3b5", + "sha256:698c1e0d5c43354ec5d6f4d914d0d553a9ada56c85415700b81dc90125aac244", + "sha256:6c1cac07e64433f646a9a838cdc00c9768b3c362805afc3fce341af0e6a9ae9f", + "sha256:760415ccc20f9e8747084169110ef75d545f3b0932ee21368f63ac0fee86b221", + "sha256:7f622b6822f02bfaf2a5cd31fdb7cd86fcf33dab6ced5185c35f5db98260b055", + "sha256:cf57719e581cfd48c4efe28543fea3d139c6b6f1238b3f0102a9c73992cbb479", + "sha256:d136ef5b418c81660ad847efe0e55c58c8208b77a57a28a503a5f345ccf01394", + "sha256:dbea0bb8575c6b6303cc65017b46351dc5953eea5c0a59d7b7e3a2d2f433a911", + "sha256:fc7f6a44d52747e65a02558e1d807c82df1d66ffa80a601862040a43ec2e3142" + ], + "index": 
"pypi", + "markers": "python_version >= '3.8'", + "version": "==23.11.0" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "flake8": { + "hashes": [ + "sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23", + "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.1'", + "version": "==6.1.0" + }, + "isort": { + "hashes": [ + "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504", + "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.0'", + "version": "==5.12.0" + }, + "mccabe": { + "hashes": [ + "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", + "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e" + ], + "markers": "python_version >= '3.6'", + "version": "==0.7.0" + }, + "mypy-extensions": { + "hashes": [ + "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", + "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.0" + }, + "packaging": { + "hashes": [ + "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", + "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7" + ], + "markers": "python_version >= '3.7'", + "version": "==23.2" + }, + "pathspec": { + "hashes": [ + "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20", + "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3" + ], + "markers": "python_version >= '3.7'", + "version": "==0.11.2" + }, + "platformdirs": { + "hashes": [ + "sha256:118c954d7e949b35437270383a3f2531e99dd93cf7ce4dc8340d3356d30f173b", + "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731" + ], + "markers": "python_version >= '3.7'", + "version": "==4.0.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f", + "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67" + ], + "markers": "python_version >= '3.8'", + "version": "==2.11.1" + }, + "pyflakes": { + "hashes": [ + "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774", + "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc" + ], + "markers": "python_version >= '3.8'", + "version": "==3.1.0" + } + } +} diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt new file mode 100644 index 0000000..e0430c4 --- /dev/null +++ b/scrapers/requirements.txt @@ -0,0 +1,23 @@ +-i https://pypi.org/simple +attrs==23.1.0; python_version >= '3.7' +beautifulsoup4==4.12.2; python_full_version >= '3.6.0' +bs4==0.0.1 +certifi==2023.11.17; python_version >= '3.6' +click==8.1.7; python_version >= '3.7' +dnspython==2.4.2; python_version >= '3.8' and python_version < '4.0' +h11==0.14.0; python_version >= '3.7' +idna==3.6; python_version >= '3.5' +outcome==1.3.0.post0; python_version >= '3.7' +pymongo==4.6.1; python_version >= '3.7' +pysocks==1.7.1 +python-dotenv==1.0.0; python_version >= '3.8' +selenium==4.15.2; python_version >= '3.8' +sniffio==1.3.0; python_version >= '3.7' +sortedcontainers==2.4.0 +soupsieve==2.5; python_version >= '3.8' 
+trio==0.23.1; python_version >= '3.8'
+trio-websocket==0.11.1; python_version >= '3.7'
+typer==0.9.0; python_version >= '3.6'
+typing-extensions==4.8.0; python_version >= '3.8'
+urllib3[socks]==2.1.0; python_version >= '3.8'
+wsproto==1.2.0; python_full_version >= '3.7.0'
diff --git a/scrapers/scrapers.py b/scrapers/scrapers.py
new file mode 100644
index 0000000..90cd562
--- /dev/null
+++ b/scrapers/scrapers.py
@@ -0,0 +1,45 @@
+import re
+
+import typer
+from src import craigslist as cl
+from src import database as db
+from src import facebook as fb
+from src import utils
+
+app = typer.Typer()
+
+craigslistScraperVersion = 1
+facebookScraperVersion = 1
+
+
+@app.command()
+def craigslist(event, context):
+    utils.scrape("craigslist", craigslistScraperVersion)
+
+
+@app.command()
+def facebook(event, context):
+    utils.scrape("facebook", facebookScraperVersion)
+
+
+@app.command()
+def link(link: str):
+    clPattern = re.compile(
+        r"^https://[a-zA-Z-]+\.craigslist\.org(?:/[^\s?]*)?(?:\?[^\s]*)?$"
+    )
+    fbPattern = re.compile(
+        r"^https://www\.facebook\.com/marketplace(?:/[^\s?]*)?(?:\?[^\s]*)?$"
+    )
+
+    if clPattern.match(link):
+        newInfo = cl.scrapeListing(link)
+        db.update(link, newInfo)
+    elif fbPattern.match(link):
+        newInfo = fb.scrapeListing(link)
+        print(newInfo)
+    else:
+        print("Not a Craigslist or Facebook Marketplace link")
+
+
+if __name__ == "__main__":
+    app()
diff --git a/scrapers/src/__init__.py b/scrapers/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapers/src/craigslist.py b/scrapers/src/craigslist.py
new file mode 100644
index 0000000..13e30b9
--- /dev/null
+++ b/scrapers/src/craigslist.py
@@ -0,0 +1,148 @@
+import time
+
+from bs4 import BeautifulSoup
+
+from . import utils
+
+
+def loadPageResources(driver):
+    scroll = 100
+
+    print("Waiting to load...")
+    time.sleep(2)
+
+    utils.scrollTo(scroll, driver)
+
+    loadImgButtons = driver.find_elements("class name", "slider-back-arrow")
+
+    time.sleep(2)
+
+    # Emulate a user scrolling
+    for i in range(len(loadImgButtons)):
+        scroll += 100
+        utils.scrollTo(scroll, driver)
+
+        utils.clickOn(loadImgButtons[i], driver)
+
+        time.sleep(0.5)
+
+
+def setupURLs(oldestAllowedCars):
+    # List of TX cities to scrape; can be expanded
+    cities = [
+        "abilene",
+        "amarillo",
+        "austin",
+        "beaumont",
+        "brownsville",
+        "collegestation",
+        "corpuschristi",
+        "dallas",
+        "nacogdoches",
+        "delrio",
+        "elpaso",
+        "galveston",
+        "houston",
+        "killeen",
+        "laredo",
+        "lubbock",
+        "mcallen",
+        "odessa",
+        "sanangelo",
+        "sanantonio",
+        "sanmarcos",
+        "bigbend",
+        "texoma",
+        "easttexas",
+        "victoriatx",
+        "waco",
+        "wichitafalls",
+    ]
+
+    # Build the Craigslist cars & trucks (cta) search URL for each city
+    base_url = (
+        "https://{}.craigslist.org/search/cta?min_auto_year={}#search=1~gallery~0~0"
+    )
+    return [base_url.format(city, oldestAllowedCars) for city in cities]
+
+
+def getAllPosts(browser):
+    # Create a BeautifulSoup object from the HTML of the page
+    html = browser.page_source
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Find all of the car listings on the page
+    return soup.find_all("div", class_="gallery-card")
+
+
+def getCarInfo(post):
+    title = post.find("span", class_="label").text
+
+    print(f'Scraping "{title}"')
+
+    price = post.find("span", class_="priceinfo").text
+    metadata = post.find("div", class_="meta").text.split("·")
+
+    odometer = metadata[1]
+    # The location is not always present in the listing metadata
+    location = metadata[2] if len(metadata) >= 3 else ""
+
+    link = post.find("a", class_="posting-title", href=True)["href"]
+
+    imageElements =
post.findAll("img") + images = [img["src"] for img in imageElements] + + return title, price, location, odometer, link, images + + +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append( + {"label": label.replace(" ", "-").lower(), "value": value} + ) + + return processedAttributes + + +def scrapeListing(url): + browser = utils.setupBrowser() + + # Navigate to the URL + print(f"Going to {url}") + browser.get(url) + + print(f"Loading page for {url}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + try: + description = soup.find("section", id="postingbody").text + attributes = processAttributes( + [ + attr.text + for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span") + ] + ) + map = soup.find("div", id="map") + + car = { + "postBody": description, + "longitude": map["data-longitude"], + "latitude": map["data-latitude"], + } + + for attr in attributes: + car[attr["label"]] = attr["value"] + + return car + except Exception as e: + print(f"Failed scraping {url}: \n{e}") + + # Close the Selenium WebDriver instance + browser.quit() diff --git a/scrapers/src/database.py b/scrapers/src/database.py new file mode 100644 index 0000000..1c88567 --- /dev/null +++ b/scrapers/src/database.py @@ -0,0 +1,89 @@ +import os +from datetime import date + +import pymongo +from dotenv import load_dotenv + +db = "scrape" +collection = "scraped_raw" + + +def get_conn(db): + # load environment variable containing db uri (which includes username and password) + load_dotenv() + db_uri = os.environ.get("DB_URI") + + # create a mongodb connection + try: + client = pymongo.MongoClient(db_uri) + + # return a friendly error if a URI error is thrown + except pymongo.errors.ConfigurationError: + print( + "An Invalid URI host error was received." + " Is your Atlas host name correct in your connection string (found the .env)?" + ) + return {"success": False, "db": 0} + + return {"success": True, "db": client.get_database(db)} + + +def post_raw( + scraperVersion, + source, + title, + price, + location, + miles, + link, + images=None, + postBody=None, + longitude=None, + latitude=None, + attributes=None, +): + car = { + "_id": link, + "source": source, + "scraper-version": scraperVersion, + "scrape-date": str(date.today()), + "title": title, + "price": price, + "location": location, + "odometer": miles, + "link": link, + } + + if images is not None: + car["images"] = images + + if postBody is not None: + car["postBody"] = postBody + + if longitude is not None: + car["longitude"] = longitude + + if latitude is not None: + car["latitude"] = latitude + + if attributes is not None: + for attr in attributes: + car[attr["label"]] = attr["value"] + + # Insert into collection called "scrape_raw" + conn = get_conn(db) + + if conn["success"]: + result = conn["db"][collection].insert_one(car) + return result.acknowledged + else: + return False + + +def update(link, newFields): + conn = get_conn(db) + if conn["success"]: + result = conn["db"][collection].update_one({"_id": link}, {"$set": newFields}) + return result.acknowledged + else: + return False diff --git a/scrapers/src/facebook.py b/scrapers/src/facebook.py new file mode 100644 index 0000000..61aef28 --- /dev/null +++ b/scrapers/src/facebook.py @@ -0,0 +1,156 @@ +import time + +from bs4 import BeautifulSoup + +from . 
import utils + +postClass = ( + "x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4" + " x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" +) +linkClass = ( + "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619" + "x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r" + " xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq" + " x1a2a7pz x1heor9g x1lku1pv" +) +thumbnailClass = "xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3" +titleClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6" +priceClass = ( + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel" + " x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i" + " x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7" + " x1s688f xzsf02u" +) +metaClass = "x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft" + +listingInfoClass = "x78zum5 xdt5ytf x1iyjqo2 x1n2onr6" +listingSectionClass = "xod5an3" +bodyClass = ( + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx" + " x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty" + " x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u" +) + + +def loadPageResources(driver): + scroll = 100 + + print("Waiting to load...") + time.sleep(2) + utils.scrollTo(scroll, driver) + time.sleep(1.5) + + # Emulate a user scrolling + for i in range(10): + scroll += 1000 + utils.scrollTo(scroll, driver) + time.sleep(1) + + +def setupURLs(oldestAllowedCars): + # List of TX cities to scrape; can be expanded + cities = ["houston", "dallas", "austin", "fortworth", "elpaso", "sanantonio"] + + # Set the URL of the Facebook Marketplace automotive category + base_url = "https://www.facebook.com/marketplace/{}/vehicles?minYear={}&exact=false" + return [base_url.format(city, oldestAllowedCars) for city in cities] + + +def getAllPosts(browser): + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + # Find all of the car listings on the page + return soup.find_all("div", class_=postClass) + + +def getCarInfo(post): + title = post.find("span", class_=titleClass).text + + print(f'Scraping "{title}"') + + price = post.find("span", class_=priceClass).text + metadata = post.findAll("span", class_=metaClass) + + location = metadata[0].text + odometer = metadata[1].text + + link = post.find("a", class_=linkClass, href=True)["href"] + link = "https://facebook.com" + link + + thumbnail = post.find("img", class_=thumbnailClass)["src"] + + return title, price, location, odometer, link, [thumbnail] + + +def getCarImages(): + # class="x1a0syf3 x1ja2u2z" + return "TODO" + + +def processAttributes(attributes): + processedAttributes = [] + + for attr in attributes: + [label, value] = attr.split(": ") + processedAttributes.append({"label": label, "value": value}) + + return processedAttributes + + +def scrapeListing(url): + browser = utils.setupBrowser() + + # Navigate to the URL + print(f"Going to {url[0:60]}") + browser.get(url[0:60]) + + print(f"Loading page for {url[0:60]}") + time.sleep(1) + + # Create a BeautifulSoup object from the HTML of the page + html = browser.page_source + soup = BeautifulSoup(html, "html.parser") + + try: + seeMoreButton = browser.find_element( + "class name", + "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x6prxxf xvq8zen x1s688f xzsf02u".replace( + " ", "." 
+ ), + ) + utils.clickOn(seeMoreButton, browser) + + listingInfo = soup.find("div", class_=listingInfoClass) + # description = listingInfo.find( + # "span", + # class_=( + # "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq" + # " x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m" + # " x6prxxf xvq8zen xo1l8bm xzsf02u" + # ), + # ) + print(listingInfo) + + return 2 + + # attributes = processAttributes( + # [ + # attr.text + # for attr in soup.findAll("p", class_="attrgroup")[1].findAll("span") + # ] + # ) + + # map = soup.find('div', id='map') + # longitude = map["data-longitude"] + # latitude = map["data-latitude"] + + # print([attributes, description, longitude, latitude]) + except Exception as error: + print(error) + return -1 + + # Close the Selenium WebDriver instance + browser.quit() diff --git a/scrapers/src/utils.py b/scrapers/src/utils.py new file mode 100644 index 0000000..399e64e --- /dev/null +++ b/scrapers/src/utils.py @@ -0,0 +1,85 @@ +from selenium import webdriver + +from . import craigslist +from . import database as db +from . import facebook + + +def scrollTo(x, driver): + driver.execute_script( + f"window.scrollTo({{top: {x}, left: 100, behavior: 'smooth'}})" + ) + + +def clickOn(elem, driver): + driver.execute_script("arguments[0].click();", elem) + + +def createDriverOptions(): + options = webdriver.ChromeOptions() + options.binary_location = "/opt/chrome/chrome" + + options.add_argument("--headless=new") + options.add_argument("--headless=new") + options.add_argument("--no-sandbox") + options.add_argument("--disable-gpu") + options.add_argument("--window-size=1280x1696") + options.add_argument("--single-process") + options.add_argument("--disable-dev-shm-usage") + options.add_argument("--disable-dev-tools") + options.add_argument("--no-zygote") + + return options + + +def setupBrowser(): + print("Setting up headless browser") + + service = webdriver.ChromeService("/opt/chromedriver") + options = createDriverOptions() + + print("Creating a new Selenium WebDriver instance") + return webdriver.Chrome(options=options, service=service) + + +def scrape(website, scraperVersion): + if website == "craigslist": + scraper = craigslist + elif website == "facebook": + scraper = facebook + + cityURLs = scraper.setupURLs(2011) + browser = setupBrowser() + + for url in cityURLs: + print(f"Going to {url}") + browser.get(url) + + print(f"Loading cars from {url}") + scraper.loadPageResources(browser) + + carPosts = scraper.getAllPosts(browser) + + for post in carPosts: + try: + title, price, location, odometer, link, images = scraper.getCarInfo( + post + ) + success = db.post_raw( + scraperVersion, + website, + title, + price, + location, + odometer, + link, + images, + ) + if success: + print("posted to db") + else: + print("failed to post to db") + except Exception as error: + print(error) + + browser.quit() diff --git a/src/scrapers/craigslist.py b/src/scrapers/craigslist.py deleted file mode 100644 index cb6ad22..0000000 --- a/src/scrapers/craigslist.py +++ /dev/null @@ -1,86 +0,0 @@ -from datetime import datetime -import csv -import json -import requests - -location_to_batch = { - "newyork": "3-0-360-0-0", - "philadelphia": "17-0-360-0-0", - "dallas": "21-0-360-0-0", - # Add more locations and their batch values as needed -} - -def fetch_job_postings(location, category): - base_url = "https://sapi.craigslist.org/web/v8/postings/search/full" - - # Get the batch value and category abbreviation from the mappings - # Default to New York if 
location not found - batch = location_to_batch.get(location) - - params = { - 'batch': batch, - 'cc': 'US', - 'lang': 'en', - 'searchPath': "cta", - "id": "0", - "collectContactInfo": True, - } - - headers = { - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', - 'Referer': f'https://{location}.craigslist.org/', - 'sec-ch-ua-mobile': '?0', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - 'sec-ch-ua-platform': '"Windows"', - 'Cookie': f'cl_b=COOKIE VALUE' - } - - response = requests.get(base_url, params=params, headers=headers) - - if response.status_code == 200: - data = response.json() - else: - print("Failed to retrieve data. Status code:", response.status_code) - data = None - - job_postings = [] - with open('file.txt', 'w') as f: - json.dump(data, f, indent=2) - - if data: - for item in data["data"]["items"]: - job_title = None - commission = None - for element in item: - if isinstance(element, str): - job_title = element - elif isinstance(element, list) and len(element) > 0 and element[0] == 7: - commission = element[1] - if job_title and commission: - job_postings.append((job_title, commission)) - return job_postings - - else: - print("No data available.") - -if __name__ == "__main__": - location = "dallas" - category = "cta" - - job_postings = fetch_job_postings(location, category) - - if job_postings: - current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S") - category = category.replace("/", "&") - csv_filename = f"{location}_{category}_openings_{current_datetime}.csv" - - with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - - writer.writerow(["Job Title", "Commission"]) - for job in job_postings: - writer.writerow([job[0], job[1]]) - - print(f"Job postings have been saved to {csv_filename}") - else: - print("No data available.") \ No newline at end of file diff --git a/src/scrapers/database.py b/src/scrapers/database.py deleted file mode 100644 index 49c463c..0000000 --- a/src/scrapers/database.py +++ /dev/null @@ -1,57 +0,0 @@ -from dotenv import load_dotenv -import pymongo -import os -from datetime import date - -def get_conn(db): - # load environment variable containing db uri (which includes username and password) - load_dotenv() - db_uri = os.getenv("DB_URI") - - # create a mongodb connection - try: - client = pymongo.MongoClient(db_uri) - - # return a friendly error if a URI error is thrown - except pymongo.errors.ConfigurationError: - print("An Invalid URI host error was received. 
Is your Atlas host name correct in your connection string (found the .env)?") - return {"success" : False, "db": 0} - - # use a database named "test" - return {"success" : True, "db": client.get_database(db)} - -def post_raw(source, title, price, location, miles, link, images = None, postBody = None, longitude = None, latitude = None, attributes = None): - car = { - "title": title, - "price": price, - "location": location, - "odometer": miles, - "link": link, - "source": source, - "scrapeDate": str(date.today()) - } - - if (images is not None): - car["images"] = images - - if (postBody is not None): - car["postBody"] = postBody - - if (longitude is not None): - car["longitude"] = longitude - - if (latitude is not None): - car["latitude"] = latitude - - if (attributes is not None): - for attr in attributes: - car[attr["label"]] = attr["value"] - - # Insert into collection called "scrape_test" - conn = get_conn("scrape") - - if (conn["success"]): - result = conn["db"]["scraped_raw"].insert_one(car) - return result.acknowledged - else: - return False \ No newline at end of file diff --git a/src/scrapers/facebook.py b/src/scrapers/facebook.py deleted file mode 100644 index 00b5e45..0000000 --- a/src/scrapers/facebook.py +++ /dev/null @@ -1,111 +0,0 @@ -from selenium import webdriver -from bs4 import BeautifulSoup -from selenium.webdriver.chrome.options import Options -import time - -import database - - -#list of cities to scrape; can be expanded -cities = [ - 'nyc', 'la', 'chicago', 'houston', 'miami', - 'philadelphia', 'phoenix', 'sanantonio', 'sandiego', 'dallas', - 'sanjose', 'austin', 'jacksonville', 'fortworth', 'columbus', - 'charlotte', 'sanfrancisco', 'indianapolis', 'seattle', 'denver', - 'washington', 'boston', 'elpaso', 'nashville', 'detroit', 'portland', 'lasvegas', 'memphis', 'louisville', - 'baltimore', 'milwaukee', 'albuquerque', 'tucson', 'fresno', - 'kansascity', 'mesa', 'atlanta', - 'coloradosprings', 'virginiabeach', 'raleigh', 'omaha', 'miami', - 'oakland', 'minneapolis', 'tulsa', 'wichita', 'neworleans' -] - -# Set the URL of the Facebook Marketplace automotive category -base_url = 'https://www.facebook.com/marketplace/{}/vehicles' -urls = [base_url.format(city) for city in cities] - -# Create a new Selenium WebDriver instance - -print("Setting up headless browser") -options = Options() -options.add_argument("--headless=new") - -print("Creating a new Selenium WebDriver instance") -driver = webdriver.Chrome(options=options) - -# Create a list to store the scraped data -print("Started scraping...") -data = {} -for url in urls: - # Navigate to the URL - print(f"Navigating to {url}") - driver.get(url) - - print(f"Loading {url}") - - time.sleep(2) - scroll = 2000 - - # Wait for the page to load - time.sleep(2) - - for i in range(50): - driver.execute_script(f"window.scrollTo(1, {scroll})") - scroll += 1000 - time.sleep(.5) - - # Get the HTML of the page - html = driver.page_source - - # Create a BeautifulSoup object from the HTML - soup = BeautifulSoup(html, 'html.parser') - - # Find all of the automotive listings on the page - car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24') - - # Iterate over the listings and scrape the data - for post in car_posts: - print("Scraping new listing") - try: - # Get the title of the listing - title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text - except AttributeError: - title = 'N/A' # Handle missing title - - try: - # 
Get the price of the listing - price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text - except AttributeError: - price = 'N/A' # Handle missing price - - try: - # Get the location of the listing - location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text - except AttributeError: - location = 'N/A' # Handle missing location - - try: - # Get the miles of the car - miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text - except (AttributeError, IndexError): - miles = 'N/A' # Handle missing miles - - try: - # Get the link to the listing - link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href'] - except (AttributeError, TypeError): - link = 'N/A' # Handle missing link - - # Add the data to the list - if (title, price, location, miles, link) not in data: - data[(title, price, location, miles, link)] = True - postSuccess = database.post_raw("facebook", title, price, location, miles, link) - if (postSuccess): - print("Save to DB") - else: - print("Failed to save to DB") - else: - print("Listing is a duplicate") - - -# Close the Selenium WebDriver instance -driver.quit() \ No newline at end of file