diff --git a/poetry.lock b/poetry.lock index cde8eae07..d1fc0c3b3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -302,6 +302,24 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "automat" +version = "22.10.0" +description = "Self-service finite-state machines for the programmer on the go." +optional = false +python-versions = "*" +files = [ + {file = "Automat-22.10.0-py2.py3-none-any.whl", hash = "sha256:c3164f8742b9dc440f3682482d32aaff7bb53f71740dd018533f9de286b64180"}, + {file = "Automat-22.10.0.tar.gz", hash = "sha256:e56beb84edad19dcc11d30e8d9b895f75deeb5ef5e96b84a467066b3b84bb04e"}, +] + +[package.dependencies] +attrs = ">=19.2.0" +six = "*" + +[package.extras] +visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"] + [[package]] name = "azure-core" version = "1.29.4" @@ -880,6 +898,17 @@ json = ["jsonschema", "pyrsistent", "pyrsistent (==0.16.1)", "requests"] protobuf = ["protobuf", "requests"] schema-registry = ["requests"] +[[package]] +name = "constantly" +version = "23.10.4" +description = "Symbolic constants in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "constantly-23.10.4-py3-none-any.whl", hash = "sha256:3fd9b4d1c3dc1ec9757f3c52aef7e53ad9323dbe39f51dfd4c43853b68dfa3f9"}, + {file = "constantly-23.10.4.tar.gz", hash = "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd"}, +] + [[package]] name = "cryptography" version = "41.0.4" @@ -925,6 +954,17 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] +[[package]] +name = "cssselect" +version = "1.2.0" +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"}, + {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, +] + [[package]] name = "curlify" version = "2.2.1" @@ -2216,6 +2256,20 @@ files = [ [package.extras] tests = ["freezegun", "pytest", "pytest-cov"] +[[package]] +name = "hyperlink" +version = "21.0.0" +description = "A featureful, immutable, and correct URL for Python." 
+optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "hyperlink-21.0.0-py2.py3-none-any.whl", hash = "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4"}, + {file = "hyperlink-21.0.0.tar.gz", hash = "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b"}, +] + +[package.dependencies] +idna = ">=2.5" + [[package]] name = "idna" version = "3.4" @@ -2246,6 +2300,21 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker perf = ["ipython"] testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] +[[package]] +name = "incremental" +version = "22.10.0" +description = "\"A small library that versions your Python projects.\"" +optional = false +python-versions = "*" +files = [ + {file = "incremental-22.10.0-py2.py3-none-any.whl", hash = "sha256:b864a1f30885ee72c5ac2835a761b8fe8aa9c28b9395cacf27286602688d3e51"}, + {file = "incremental-22.10.0.tar.gz", hash = "sha256:912feeb5e0f7e0188e6f42241d2f450002e11bbc0937c65865045854c24c0bd0"}, +] + +[package.extras] +mypy = ["click (>=6.0)", "mypy (==0.812)", "twisted (>=16.4.0)"] +scripts = ["click (>=6.0)", "twisted (>=16.4.0)"] + [[package]] name = "inflection" version = "0.5.1" @@ -2282,6 +2351,34 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "itemadapter" +version = "0.8.0" +description = "Common interface for data container classes" +optional = false +python-versions = ">=3.7" +files = [ + {file = "itemadapter-0.8.0-py3-none-any.whl", hash = "sha256:2ac1fbcc363b789a18639935ca322e50a65a0a7dfdd8d973c34e2c468e6c0f94"}, + {file = "itemadapter-0.8.0.tar.gz", hash = "sha256:77758485fb0ac10730d4b131363e37d65cb8db2450bfec7a57c3f3271f4a48a9"}, +] + +[[package]] +name = "itemloaders" +version = "1.1.0" +description = "Base library for scrapy's ItemLoader" +optional = false +python-versions = ">=3.7" +files = [ + {file = "itemloaders-1.1.0-py3-none-any.whl", hash = "sha256:c8c82fe0c11fc4cdd08ec04df0b3c43f3cb7190002edb517e02d55de8efc2aeb"}, + {file = "itemloaders-1.1.0.tar.gz", hash = "sha256:21d81c61da6a08b48e5996288cdf3031c0f92e5d0075920a0242527523e14a48"}, +] + +[package.dependencies] +itemadapter = ">=0.1.0" +jmespath = ">=0.9.5" +parsel = ">=1.5.0" +w3lib = ">=1.17.0" + [[package]] name = "jmespath" version = "1.0.1" @@ -3281,6 +3378,24 @@ files = [ numpy = ">=1.24.3" types-pytz = ">=2022.1.1" +[[package]] +name = "parsel" +version = "1.8.1" +description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors" +optional = false +python-versions = ">=3.7" +files = [ + {file = "parsel-1.8.1-py2.py3-none-any.whl", hash = "sha256:2708fc74daeeb4ce471e2c2e9089b650ec940c7a218053e57421e69b5b00f82c"}, + {file = "parsel-1.8.1.tar.gz", hash = "sha256:aff28e68c9b3f1a901db2a4e3f158d8480a38724d7328ee751c1a4e1c1801e39"}, +] + +[package.dependencies] +cssselect = ">=0.9" +jmespath = "*" +lxml = "*" +packaging = "*" +w3lib = ">=1.19.0" + [[package]] name = "pathspec" version = "0.11.2" @@ -3548,6 +3663,17 @@ dev = ["black", "flake8", "flake8-print", "isort", "pre-commit"] sentry = ["django", "sentry-sdk"] test = ["coverage", "flake8", "freezegun (==0.3.15)", "mock (>=2.0.0)", "pylint", "pytest"] +[[package]] +name = "protego" +version = "0.3.0" +description = "Pure-Python robots.txt parser with support for modern 
conventions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "Protego-0.3.0-py2.py3-none-any.whl", hash = "sha256:db38f6a945839d8162a4034031a21490469566a2726afb51d668497c457fb0aa"}, + {file = "Protego-0.3.0.tar.gz", hash = "sha256:04228bffde4c6bcba31cf6529ba2cfd6e1b70808fdc1d2cb4301be6b28d6c568"}, +] + [[package]] name = "proto-plus" version = "1.22.3" @@ -3620,7 +3746,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -3729,6 +3854,17 @@ all = ["apache-bookkeeper-client (>=4.16.1)", "fastavro (==1.7.3)", "grpcio (>=1 avro = ["fastavro (==1.7.3)"] functions = ["apache-bookkeeper-client (>=4.16.1)", "grpcio (>=1.8.2)", "prometheus-client", "protobuf (>=3.6.1,<=3.20.3)", "ratelimit"] +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] + [[package]] name = "pyairtable" version = "2.1.0.post1" @@ -3900,6 +4036,20 @@ typing-extensions = ">=4.2.0" dotenv = ["python-dotenv (>=0.10.4)"] email = ["email-validator (>=1.0.3)"] +[[package]] +name = "pydispatcher" +version = "2.0.7" +description = "Multi-producer multi-consumer in-memory signal dispatch system" +optional = false +python-versions = "*" +files = [ + {file = "PyDispatcher-2.0.7-py3-none-any.whl", hash = "sha256:96543bea04115ffde08f851e1d45cacbfd1ee866ac42127d9b476dc5aefa7de0"}, + {file = "PyDispatcher-2.0.7.tar.gz", hash = "sha256:b777c6ad080dc1bad74a4c29d6a46914fa6701ac70f94b0d66fbcfde62f5be31"}, +] + +[package.extras] +dev = ["tox"] + [[package]] name = "pyflakes" version = "3.1.0" @@ -4062,6 +4212,24 @@ files = [ ed25519 = ["PyNaCl (>=1.4.0)"] rsa = ["cryptography"] +[[package]] +name = "pyopenssl" +version = "23.2.0" +description = "Python wrapper module around the OpenSSL library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyOpenSSL-23.2.0-py3-none-any.whl", hash = "sha256:24f0dc5227396b3e831f4c7f602b950a5e9833d292c8e4a2e06b709292806ae2"}, + {file = "pyOpenSSL-23.2.0.tar.gz", hash = "sha256:276f931f55a452e7dea69c7173e984eb2a4407ce413c918aa34b55f82f9b8bac"}, +] + +[package.dependencies] +cryptography = ">=38.0.0,<40.0.0 || 
>40.0.0,<40.0.1 || >40.0.1,<42" + +[package.extras] +docs = ["sphinx (!=5.2.0,!=5.2.0.post0)", "sphinx-rtd-theme"] +test = ["flaky", "pretend", "pytest (>=3.0.1)"] + [[package]] name = "pypandoc" version = "1.11" @@ -4108,6 +4276,16 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow", "PyCryptodome"] image = ["Pillow"] +[[package]] +name = "pypydispatcher" +version = "2.1.2" +description = "Multi-producer-multi-consumer signal dispatching mechanism" +optional = false +python-versions = "*" +files = [ + {file = "PyPyDispatcher-2.1.2.tar.gz", hash = "sha256:b6bec5dfcff9d2535bca2b23c80eae367b1ac250a645106948d315fcfa9130f2"}, +] + [[package]] name = "pyreadline3" version = "3.4.1" @@ -4141,6 +4319,38 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-forked" +version = "1.6.0" +description = "run tests in isolated forked subprocesses" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-forked-1.6.0.tar.gz", hash = "sha256:4dafd46a9a600f65d822b8f605133ecf5b3e1941ebb3588e943b4e3eb71a5a3f"}, + {file = "pytest_forked-1.6.0-py3-none-any.whl", hash = "sha256:810958f66a91afb1a1e2ae83089d8dc1cd2437ac96b12963042fbb9fb4d16af0"}, +] + +[package.dependencies] +py = "*" +pytest = ">=3.10" + +[[package]] +name = "pytest-mock" +version = "3.12.0" +description = "Thin-wrapper around the mock package for easier use with pytest" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-mock-3.12.0.tar.gz", hash = "sha256:31a40f038c22cad32287bb43932054451ff5583ff094bca6f675df2f8bc1a6e9"}, + {file = "pytest_mock-3.12.0-py3-none-any.whl", hash = "sha256:0972719a7263072da3a21c7f4773069bcc7486027d7e8e1f81d98a47e701bc4f"}, +] + +[package.dependencies] +pytest = ">=5.0" + +[package.extras] +dev = ["pre-commit", "pytest-asyncio", "tox"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -4281,7 +4491,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -4316,6 +4525,17 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "queuelib" +version = "1.6.2" +description = "Collection of persistent (disk-based) and non-persistent (memory-based) queues" +optional = false +python-versions = ">=3.5" +files = [ + {file = "queuelib-1.6.2-py2.py3-none-any.whl", hash = 
"sha256:4b96d48f650a814c6fb2fd11b968f9c46178b683aad96d68f930fe13a8574d19"}, + {file = "queuelib-1.6.2.tar.gz", hash = "sha256:4b207267f2642a8699a1f806045c56eb7ad1a85a10c0e249884580d139c2fcd2"}, +] + [[package]] name = "regex" version = "2023.10.3" @@ -4567,6 +4787,37 @@ fsspec = "2023.9.2" awscli = ["aiobotocore[awscli] (>=2.5.4,<2.6.0)"] boto3 = ["aiobotocore[boto3] (>=2.5.4,<2.6.0)"] +[[package]] +name = "scrapy" +version = "2.11.1" +description = "A high-level Web Crawling and Web Scraping framework" +optional = false +python-versions = ">=3.8" +files = [ + {file = "Scrapy-2.11.1-py2.py3-none-any.whl", hash = "sha256:f1edee0cd214512054c01a8d031a8d213dddb53492b02c9e66256e3efe90d175"}, + {file = "Scrapy-2.11.1.tar.gz", hash = "sha256:733a039c7423e52b69bf2810b5332093d4e42a848460359c07b02ecff8f73ebe"}, +] + +[package.dependencies] +cryptography = ">=36.0.0" +cssselect = ">=0.9.1" +itemadapter = ">=0.1.0" +itemloaders = ">=1.0.1" +lxml = ">=4.4.1" +packaging = "*" +parsel = ">=1.5.0" +protego = ">=0.1.15" +PyDispatcher = {version = ">=2.0.5", markers = "platform_python_implementation == \"CPython\""} +pyOpenSSL = ">=21.0.0" +PyPyDispatcher = {version = ">=2.1.0", markers = "platform_python_implementation == \"PyPy\""} +queuelib = ">=1.4.2" +service-identity = ">=18.1.0" +setuptools = "*" +tldextract = "*" +Twisted = ">=18.9.0" +w3lib = ">=1.17.0" +"zope.interface" = ">=5.1.0" + [[package]] name = "semver" version = "3.0.2" @@ -4578,6 +4829,30 @@ files = [ {file = "semver-3.0.2.tar.gz", hash = "sha256:6253adb39c70f6e51afed2fa7152bcd414c411286088fb4b9effb133885ab4cc"}, ] +[[package]] +name = "service-identity" +version = "24.1.0" +description = "Service identity verification for pyOpenSSL & cryptography." +optional = false +python-versions = ">=3.8" +files = [ + {file = "service_identity-24.1.0-py3-none-any.whl", hash = "sha256:a28caf8130c8a5c1c7a6f5293faaf239bbfb7751e4862436920ee6f2616f568a"}, + {file = "service_identity-24.1.0.tar.gz", hash = "sha256:6829c9d62fb832c2e1c435629b0a8c476e1929881f28bee4d20bc24161009221"}, +] + +[package.dependencies] +attrs = ">=19.1.0" +cryptography = "*" +pyasn1 = "*" +pyasn1-modules = "*" + +[package.extras] +dev = ["pyopenssl", "service-identity[idna,mypy,tests]"] +docs = ["furo", "myst-parser", "pyopenssl", "sphinx", "sphinx-notfound-page"] +idna = ["idna"] +mypy = ["idna", "mypy", "types-pyopenssl"] +tests = ["coverage[toml] (>=5.0.2)", "pytest"] + [[package]] name = "setuptools" version = "68.2.2" @@ -4972,6 +5247,26 @@ requests = ">=2.26.0" [package.extras] blobfile = ["blobfile (>=2)"] +[[package]] +name = "tldextract" +version = "5.1.1" +description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "tldextract-5.1.1-py3-none-any.whl", hash = "sha256:b9c4510a8766d377033b6bace7e9f1f17a891383ced3c5d50c150f181e9e1cc2"}, + {file = "tldextract-5.1.1.tar.gz", hash = "sha256:9b6dbf803cb5636397f0203d48541c0da8ba53babaf0e8a6feda2d88746813d4"}, +] + +[package.dependencies] +filelock = ">=3.0.8" +idna = "*" +requests = ">=2.1.0" +requests-file = ">=1.4" + +[package.extras] +testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "responses", "ruff", "tox", "types-filelock", "types-requests"] + [[package]] name = "tokenizers" version = "0.14.1" @@ -5129,6 +5424,72 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "twisted" +version = "22.10.0" +description = "An asynchronous networking framework written in Python" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "Twisted-22.10.0-py3-none-any.whl", hash = "sha256:86c55f712cc5ab6f6d64e02503352464f0400f66d4f079096d744080afcccbd0"}, + {file = "Twisted-22.10.0.tar.gz", hash = "sha256:32acbd40a94f5f46e7b42c109bfae2b302250945561783a8b7a059048f2d4d31"}, +] + +[package.dependencies] +attrs = ">=19.2.0" +Automat = ">=0.8.0" +constantly = ">=15.1" +hyperlink = ">=17.1.1" +incremental = ">=21.3.0" +twisted-iocpsupport = {version = ">=1.0.2,<2", markers = "platform_system == \"Windows\""} +typing-extensions = ">=3.6.5" +"zope.interface" = ">=4.4.2" + +[package.extras] +all-non-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +conch = ["appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "cryptography (>=2.6)", "pyasn1"] +conch-nacl = ["PyNaCl", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "cryptography (>=2.6)", "pyasn1"] +contextvars = ["contextvars (>=2.4,<3)"] +dev = ["coverage (>=6b1,<7)", "pydoctor (>=22.9.0,<22.10.0)", "pyflakes (>=2.2,<3.0)", "python-subunit (>=1.4,<2.0)", "readthedocs-sphinx-ext (>=2.1,<3.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)", "twistedchecker (>=0.7,<1.0)"] +dev-release = ["pydoctor (>=22.9.0,<22.10.0)", "readthedocs-sphinx-ext (>=2.1,<3.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)"] +gtk-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pygobject", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +http2 = ["h2 (>=3.0,<5.0)", "priority (>=1.1.0,<2.0)"] +macos-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyobjc-core", "pyobjc-framework-CFNetwork", "pyobjc-framework-Cocoa", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +mypy = ["PyHamcrest (>=1.9.0)", "PyNaCl", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "coverage (>=6b1,<7)", "cryptography (>=2.6)", 
"cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "mypy (==0.930)", "mypy-zope (==0.3.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pydoctor (>=22.9.0,<22.10.0)", "pyflakes (>=2.2,<3.0)", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "python-subunit (>=1.4,<2.0)", "pywin32 (!=226)", "readthedocs-sphinx-ext (>=2.1,<3.0)", "service-identity (>=18.1.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)", "twistedchecker (>=0.7,<1.0)", "types-pyOpenSSL", "types-setuptools"] +osx-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyobjc-core", "pyobjc-framework-CFNetwork", "pyobjc-framework-Cocoa", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +serial = ["pyserial (>=3.0)", "pywin32 (!=226)"] +test = ["PyHamcrest (>=1.9.0)", "cython-test-exception-raiser (>=1.0.2,<2)", "hypothesis (>=6.0,<7.0)"] +tls = ["idna (>=2.4)", "pyopenssl (>=21.0.0)", "service-identity (>=18.1.0)"] +windows-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] + +[[package]] +name = "twisted-iocpsupport" +version = "1.0.4" +description = "An extension for use in the twisted I/O Completion Ports reactor." +optional = false +python-versions = "*" +files = [ + {file = "twisted-iocpsupport-1.0.4.tar.gz", hash = "sha256:858096c0d15e33f15ac157f455d8f86f2f2cdd223963e58c0f682a3af8362d89"}, + {file = "twisted_iocpsupport-1.0.4-cp310-cp310-win32.whl", hash = "sha256:afa2b630797f9ed2f27f3d9f55e3f72b4244911e45a8c82756f44babbf0b243e"}, + {file = "twisted_iocpsupport-1.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:0058c963c8957bcd3deda62122e89953c9de1e867a274facc9b15dde1a9f31e8"}, + {file = "twisted_iocpsupport-1.0.4-cp311-cp311-win32.whl", hash = "sha256:196f7c7ccad4ba4d1783b1c4e1d1b22d93c04275cd780bf7498d16c77319ad6e"}, + {file = "twisted_iocpsupport-1.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:4e5f97bcbabdd79cbaa969b63439b89801ea560f11d42b0a387634275c633623"}, + {file = "twisted_iocpsupport-1.0.4-cp312-cp312-win32.whl", hash = "sha256:6081bd7c2f4fcf9b383dcdb3b3385d75a26a7c9d2be25b6950c3d8ea652d2d2d"}, + {file = "twisted_iocpsupport-1.0.4-cp312-cp312-win_amd64.whl", hash = "sha256:76f7e67cec1f1d097d1f4ed7de41be3d74546e1a4ede0c7d56e775c4dce5dfb0"}, + {file = "twisted_iocpsupport-1.0.4-cp36-cp36m-win32.whl", hash = "sha256:3d306fc4d88a6bcf61ce9d572c738b918578121bfd72891625fab314549024b5"}, + {file = "twisted_iocpsupport-1.0.4-cp36-cp36m-win_amd64.whl", hash = "sha256:391ac4d6002a80e15f35adc4ad6056f4fe1c17ceb0d1f98ba01b0f4f917adfd7"}, + {file = "twisted_iocpsupport-1.0.4-cp37-cp37m-win32.whl", hash = "sha256:0c1b5cf37f0b2d96cc3c9bc86fff16613b9f5d0ca565c96cf1f1fb8cfca4b81c"}, + {file = "twisted_iocpsupport-1.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:3c5dc11d72519e55f727320e3cee535feedfaee09c0f0765ed1ca7badff1ab3c"}, + {file = "twisted_iocpsupport-1.0.4-cp38-cp38-win32.whl", hash = "sha256:cc86c2ef598c15d824a243c2541c29459881c67fc3c0adb6efe2242f8f0ec3af"}, + 
{file = "twisted_iocpsupport-1.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:c27985e949b9b1a1fb4c20c71d315c10ea0f93fdf3ccdd4a8c158b5926edd8c8"}, + {file = "twisted_iocpsupport-1.0.4-cp39-cp39-win32.whl", hash = "sha256:e311dfcb470696e3c077249615893cada598e62fa7c4e4ca090167bd2b7d331f"}, + {file = "twisted_iocpsupport-1.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:4574eef1f3bb81501fb02f911298af3c02fe8179c31a33b361dd49180c3e644d"}, + {file = "twisted_iocpsupport-1.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:872747a3b64e2909aee59c803ccd0bceb9b75bf27915520ebd32d69687040fa2"}, + {file = "twisted_iocpsupport-1.0.4-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:c2712b778bacf1db434e3e065adfed3db300754186a29aecac1efae9ef4bcaff"}, + {file = "twisted_iocpsupport-1.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7c66fa0aa4236b27b3c61cb488662d85dae746a6d1c7b0d91cf7aae118445adf"}, + {file = "twisted_iocpsupport-1.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:300437af17396a945a58dcfffd77863303a8b6d9e65c6e81f1d2eed55b50d444"}, +] + [[package]] name = "types-pytz" version = "2023.3.1.1" @@ -5373,6 +5734,17 @@ files = [ docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] +[[package]] +name = "w3lib" +version = "2.1.2" +description = "Library of web-related functions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "w3lib-2.1.2-py3-none-any.whl", hash = "sha256:c4432926e739caa8e3f49f5de783f336df563d9490416aebd5d39fb896d264e7"}, + {file = "w3lib-2.1.2.tar.gz", hash = "sha256:ed5b74e997eea2abe3c1321f916e344144ee8e9072a6f33463ee8e57f858a4b1"}, +] + [[package]] name = "watchfiles" version = "0.21.0" @@ -5815,6 +6187,59 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +[[package]] +name = "zope-interface" +version = "6.2" +description = "Interfaces for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "zope.interface-6.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:506f5410b36e5ba494136d9fa04c548eaf1a0d9c442b0b0e7a0944db7620e0ab"}, + {file = "zope.interface-6.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b386b8b9d2b6a5e1e4eadd4e62335571244cb9193b7328c2b6e38b64cfda4f0e"}, + {file = "zope.interface-6.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb0b3f2cb606981c7432f690db23506b1db5899620ad274e29dbbbdd740e797"}, + {file = "zope.interface-6.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de7916380abaef4bb4891740879b1afcba2045aee51799dfd6d6ca9bdc71f35f"}, + {file = "zope.interface-6.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b240883fb43160574f8f738e6d09ddbdbf8fa3e8cea051603d9edfd947d9328"}, + {file = "zope.interface-6.2-cp310-cp310-win_amd64.whl", hash = "sha256:8af82afc5998e1f307d5e72712526dba07403c73a9e287d906a8aa2b1f2e33dd"}, + {file = 
"zope.interface-6.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4d45d2ba8195850e3e829f1f0016066a122bfa362cc9dc212527fc3d51369037"}, + {file = "zope.interface-6.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:76e0531d86523be7a46e15d379b0e975a9db84316617c0efe4af8338dc45b80c"}, + {file = "zope.interface-6.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59f7374769b326a217d0b2366f1c176a45a4ff21e8f7cebb3b4a3537077eff85"}, + {file = "zope.interface-6.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25e0af9663eeac6b61b231b43c52293c2cb7f0c232d914bdcbfd3e3bd5c182ad"}, + {file = "zope.interface-6.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e02a6fc1772b458ebb6be1c276528b362041217b9ca37e52ecea2cbdce9fac"}, + {file = "zope.interface-6.2-cp311-cp311-win_amd64.whl", hash = "sha256:02adbab560683c4eca3789cc0ac487dcc5f5a81cc48695ec247f00803cafe2fe"}, + {file = "zope.interface-6.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8f5d2c39f3283e461de3655e03faf10e4742bb87387113f787a7724f32db1e48"}, + {file = "zope.interface-6.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:75d2ec3d9b401df759b87bc9e19d1b24db73083147089b43ae748aefa63067ef"}, + {file = "zope.interface-6.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa994e8937e8ccc7e87395b7b35092818905cf27c651e3ff3e7f29729f5ce3ce"}, + {file = "zope.interface-6.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ede888382882f07b9e4cd942255921ffd9f2901684198b88e247c7eabd27a000"}, + {file = "zope.interface-6.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2606955a06c6852a6cff4abeca38346ed01e83f11e960caa9a821b3626a4467b"}, + {file = "zope.interface-6.2-cp312-cp312-win_amd64.whl", hash = "sha256:ac7c2046d907e3b4e2605a130d162b1b783c170292a11216479bb1deb7cadebe"}, + {file = "zope.interface-6.2-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:febceb04ee7dd2aef08c2ff3d6f8a07de3052fc90137c507b0ede3ea80c21440"}, + {file = "zope.interface-6.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fc711acc4a1c702ca931fdbf7bf7c86f2a27d564c85c4964772dadf0e3c52f5"}, + {file = "zope.interface-6.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:396f5c94654301819a7f3a702c5830f0ea7468d7b154d124ceac823e2419d000"}, + {file = "zope.interface-6.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dd374927c00764fcd6fe1046bea243ebdf403fba97a937493ae4be2c8912c2b"}, + {file = "zope.interface-6.2-cp37-cp37m-win_amd64.whl", hash = "sha256:a3046e8ab29b590d723821d0785598e0b2e32b636a0272a38409be43e3ae0550"}, + {file = "zope.interface-6.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de125151a53ecdb39df3cb3deb9951ed834dd6a110a9e795d985b10bb6db4532"}, + {file = "zope.interface-6.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f444de0565db46d26c9fa931ca14f497900a295bd5eba480fc3fad25af8c763e"}, + {file = "zope.interface-6.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2fefad268ff5c5b314794e27e359e48aeb9c8bb2cbb5748a071757a56f6bb8f"}, + {file = "zope.interface-6.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:97785604824981ec8c81850dd25c8071d5ce04717a34296eeac771231fbdd5cd"}, + {file = "zope.interface-6.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7b2bed4eea047a949296e618552d3fed00632dc1b795ee430289bdd0e3717f3"}, + {file = "zope.interface-6.2-cp38-cp38-win_amd64.whl", hash = "sha256:d54f66c511ea01b9ef1d1a57420a93fbb9d48a08ec239f7d9c581092033156d0"}, + {file = "zope.interface-6.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5ee9789a20b0081dc469f65ff6c5007e67a940d5541419ca03ef20c6213dd099"}, + {file = "zope.interface-6.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:af27b3fe5b6bf9cd01b8e1c5ddea0a0d0a1b8c37dc1c7452f1e90bf817539c6d"}, + {file = "zope.interface-6.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bce517b85f5debe07b186fc7102b332676760f2e0c92b7185dd49c138734b70"}, + {file = "zope.interface-6.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ae9793f114cee5c464cc0b821ae4d36e1eba961542c6086f391a61aee167b6f"}, + {file = "zope.interface-6.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e87698e2fea5ca2f0a99dff0a64ce8110ea857b640de536c76d92aaa2a91ff3a"}, + {file = "zope.interface-6.2-cp39-cp39-win_amd64.whl", hash = "sha256:b66335bbdbb4c004c25ae01cc4a54fd199afbc1fd164233813c6d3c2293bb7e1"}, + {file = "zope.interface-6.2.tar.gz", hash = "sha256:3b6c62813c63c543a06394a636978b22dffa8c5410affc9331ce6cdb5bfa8565"}, +] + +[package.dependencies] +setuptools = "*" + +[package.extras] +docs = ["Sphinx", "repoze.sphinx.autointerface", "sphinx_rtd_theme"] +test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] +testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] + [[package]] name = "zstandard" version = "0.21.0" @@ -5876,4 +6301,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a7590f6bd58efbc59ab986052bff21f235bb54351d4313d2fa9f99388f5b210c" +content-hash = "5f6fe2de1fa706b8e2ee0a3c07618649866146b4814d17d3fedf6e361007565f" diff --git a/pyproject.toml b/pyproject.toml index fd760c655..b90e1eaa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,9 @@ black = "^23.3.0" pypdf2 = "^3.0.1" greenlet = "<3.0.0" confluent-kafka = "^2.3.0" +pytest-mock = "^3.12.0" +twisted = "22.10.0" +pytest-forked = "^1.6.0" [tool.poetry.group.sql_database.dependencies] sqlalchemy = ">=1.4" @@ -80,6 +83,11 @@ pyairtable = "^2.1.0.post1" [tool.poetry.group.filesystem.dependencies] adlfs = ">=2023.9.0" + +[tool.poetry.group.scrapy.dependencies] +scrapy = "^2.11.0" +twisted = "22.10.0" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/sources/.dlt/config.toml b/sources/.dlt/config.toml index 7e7efd452..aaf0732d8 100644 --- a/sources/.dlt/config.toml +++ b/sources/.dlt/config.toml @@ -60,3 +60,12 @@ access_token_expires_at=1688821881 [sources.workable] subdomain="dlthub-test" + +[sources.scraping] +batch_size = 100 +queue_size = 3000 +queue_result_timeout = 3.0 +start_urls_file="/path/to/urls.txt" +start_urls = [ + "https://quotes.toscrape.com/page/1/" +] diff --git a/sources/bing_webmaster/README.md b/sources/bing_webmaster/README.md index 0f10289d9..2451580ae 100644 --- a/sources/bing_webmaster/README.md +++ b/sources/bing_webmaster/README.md @@ -57,7 +57,7 @@ api_key = "Please set me up!" # please set me up! 3. 
Now the pipeline can be run by using the command: ```bash - python3 bing_webmaster_pipeline.py + python bing_webmaster_pipeline.py ``` 3. To make sure that everything is loaded as expected, use the command: diff --git a/sources/google_sheets/helpers/data_processing.py b/sources/google_sheets/helpers/data_processing.py index 85311cb01..ca0ae4206 100644 --- a/sources/google_sheets/helpers/data_processing.py +++ b/sources/google_sheets/helpers/data_processing.py @@ -224,7 +224,7 @@ def serial_date_to_datetime( ) # int values are dates, float values are datetimes if data_type == "date": - return conv_datetime.date() # type: ignore + return conv_datetime.date() # type: ignore[no-any-return] return conv_datetime diff --git a/sources/kinesis/helpers.py b/sources/kinesis/helpers.py index 65552e5b8..990dd3771 100644 --- a/sources/kinesis/helpers.py +++ b/sources/kinesis/helpers.py @@ -2,7 +2,7 @@ import dlt from dlt.common import pendulum -from dlt.common.typing import DictStrStr, StrStr +from dlt.common.typing import DictStrAny, StrAny, StrStr def get_shard_iterator( @@ -11,7 +11,7 @@ def get_shard_iterator( shard_id: str, last_msg: dlt.sources.incremental[StrStr], initial_at_timestamp: pendulum.DateTime, -) -> Tuple[str, StrStr]: +) -> Tuple[str, StrAny]: """Gets shard `shard_id` of `stream_name` iterator. If `last_msg` incremental is present it may contain last message sequence for shard_id. in that case AFTER_SEQUENCE_NUMBER is created. If no message sequence is present, `initial_at_timestamp` is used for AT_TIMESTAMP or LATEST. @@ -20,7 +20,7 @@ def get_shard_iterator( sequence_state = ( {} if last_msg is None else last_msg.last_value or last_msg.initial_value or {} ) - iterator_params: DictStrStr + iterator_params: DictStrAny msg_sequence = sequence_state.get(shard_id, None) if msg_sequence: iterator_params = dict( diff --git a/sources/scraping/README.md b/sources/scraping/README.md new file mode 100644 index 000000000..37aa6100d --- /dev/null +++ b/sources/scraping/README.md @@ -0,0 +1,112 @@ +--- +title: Scraping with DLT +description: dlt source to scrape web content +keywords: [scrapy, scraping, spiders, crawler, crawling] +--- + +# Scraping + +Scraping source allows you to scrape content from web and uses [Scrapy](https://doc.scrapy.org/en/latest/) +to enable this capability. + +It is possible to access and manipulate a scraping resource via (please see `scraping_pipeline.py`) + +1. `on_before_start` callback which will receive a `DltResource` as the only argument, +2. The advanced scraping pipeline builder `scraping.helpers.create_pipeline_runner` + +## Initialize the pipeline + +```bash +dlt init scraping duckdb +``` + +## 🎲 Configuration + +It is possible to provide configuration via `.dlt/config.toml` below you can see an example + +```toml +[sources.scraping] +# Batch size - how many scraped results to collect +# before dispatching to DLT pipeline +batch_size = 100 +# Defaul queue size +queue_size = 3000 +# How log to wait before exiting +queue_result_timeout = 3.0 +start_urls = [ + "https://quotes.toscrape.com/page/1/" +] +start_urls_file="/path/to/urls.txt" +``` + +When both `start_urls` and `start_urls_file` they will be merged and deduplicated so Scrapy +gets a unique set of `start_urls`. 
+ +## 🏎️ Running the pipeline + +Install requirements and run the pipeline + +```sh +pip install -r requirements.txt +python scraping_pipeline.py +``` + +## Implementing a spider + +It is your responsibility to implement the spider and data extraction logic from the responses +because our runner expects spider class, please see as a reference an example of spider in `scraping_pipeline.py`. +For more information about spider implementation please also see [Scrapy docs](https://docs.scrapy.org/en/latest/topics/spiders.html). + +## Configuring Scrapy + +You can pass scrapy settings via + +1. `run_pipeline(..., scrapy_settings={...})`, +2. `create_pipeline_runner(..., scrapy_settings={...})`, +3. Overriding defaults in `settings.py`. + +Example: +```py +run_pipeline( + pipeline, + MySpider, + scrapy_settings={ + # How many sub pages to scrape + # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit + "DEPTH_LIMIT": 0, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + }, + "HTTPERROR_ALLOW_ALL": True, + }, +) +``` + +Note: this is just a shallow merge. +Also log level is automatically set in sync with the one +dlt provides so providing it via `scrapy_settings` as `"LOG_LEVEL": "DEBUG"` will not work, +please see [logging documentation](https://dlthub.com/docs/running-in-production/running#set-the-log-level-and-format) for dlt. + +## 🧐 Introspection using streamlit + +NOTE: you might need to set up `streamlit`, `pip install streamlit` + +```sh +dlt pipeline show +``` + +## 🧠 How it works? + +Under the hood we run DLT [pipeline](https://dlthub.com/docs/api_reference/pipeline) in a separate thread while scrapy is running in the main thread. + +Communication between the two is done via the queue, where + +- Spider is responsible to put the results in the queue, +- DLT resource collects and batches results from the queue. + +![simple diagram](./diagram.png) + +

+Enjoy it!
+
+✨ 🚀 ✨

diff --git a/sources/scraping/__init__.py b/sources/scraping/__init__.py new file mode 100644 index 000000000..864eb00cd --- /dev/null +++ b/sources/scraping/__init__.py @@ -0,0 +1,74 @@ +"""Scraping source + +Integrates Dlt and Scrapy to facilitate scraping pipelines. +""" +import inspect +import typing as t + +import dlt + +from dlt.sources import DltResource +from dlt.common.source import _SOURCES, SourceInfo + +from scrapy import Spider # type: ignore + +from .helpers import ScrapingConfig, create_pipeline_runner +from .types import P, AnyDict + + +def run_pipeline( # type: ignore[valid-type] + pipeline: dlt.Pipeline, + spider: t.Type[Spider], + *args: P.args, + on_before_start: t.Callable[[DltResource], None] = None, + scrapy_settings: t.Optional[AnyDict] = None, + batch_size: t.Optional[int] = None, + queue_size: t.Optional[int] = None, + queue_result_timeout: t.Optional[float] = None, + **kwargs: P.kwargs, +) -> None: + """Simple runner for the scraping pipeline + + You can pass all parameters via kwargs to `dlt.pipeline.run(....)` + + ``` + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, + dataset_name: str = None, + credentials: Any = None, + table_name: str = None, + write_disposition: TWriteDisposition = None, + columns: TAnySchemaColumns = None, + primary_key: TColumnNames = None, + schema: Schema = None, + loader_file_format: TLoaderFileFormat = None + ``` + """ + options: AnyDict = {} + if scrapy_settings: + options["scrapy_settings"] = scrapy_settings + + if batch_size: + options["batch_size"] = batch_size + + if queue_size: + options["queue_size"] = queue_size + + if queue_result_timeout: + options["queue_result_timeout"] = queue_result_timeout + + scraping_host = create_pipeline_runner(pipeline, spider, **options) + + if on_before_start: + on_before_start(scraping_host.pipeline_runner.scraping_resource) + + scraping_host.run(*args, **kwargs) + + +# This way we allow dlt init to detect scraping source it is indeed hacky +# and the core team is working to provide a better alternative. 
+_SOURCES[run_pipeline.__qualname__] = SourceInfo( + ScrapingConfig, + run_pipeline, + inspect.getmodule(run_pipeline), +) diff --git a/sources/scraping/diagram.png b/sources/scraping/diagram.png new file mode 100644 index 000000000..eebf6f7fd Binary files /dev/null and b/sources/scraping/diagram.png differ diff --git a/sources/scraping/helpers.py b/sources/scraping/helpers.py new file mode 100644 index 000000000..ad61dae83 --- /dev/null +++ b/sources/scraping/helpers.py @@ -0,0 +1,103 @@ +import os +import typing as t + +import dlt +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs.base_configuration import ( + configspec, + BaseConfiguration, +) + +from scrapy import Spider # type: ignore + +from .queue import ScrapingQueue +from .settings import SOURCE_SCRAPY_QUEUE_SIZE, SOURCE_SCRAPY_SETTINGS +from .runner import ScrapingHost, PipelineRunner, ScrapyRunner, Signals +from .types import AnyDict + + +@configspec +class ScrapingConfig(BaseConfiguration): + # Batch size for scraped items + batch_size: int = 100 + + # maxsize for queue + queue_size: t.Optional[int] = SOURCE_SCRAPY_QUEUE_SIZE + + # result wait timeout for our queue + queue_result_timeout: t.Optional[float] = 1.0 + + # List of start urls + start_urls: t.List[str] = None + start_urls_file: str = None + + +@with_config(sections=("sources", "scraping"), spec=ScrapingConfig) +def resolve_start_urls( + start_urls: t.Optional[t.List[str]] = dlt.config.value, + start_urls_file: t.Optional[str] = dlt.config.value, +) -> t.List[str]: + """Merges start urls + If both `start_urls` and `start_urls_file` given, we will merge them + and return deduplicated list of `start_urls` for scrapy spider. + """ + urls = set() + if os.path.exists(start_urls_file): + with open(start_urls_file, encoding="utf-8") as fp: + urls = {line for line in fp.readlines() if str(line).strip()} + + if start_urls: + for url in start_urls: + urls.add(url) + + return list(set(urls)) + + +@with_config(sections=("sources", "scraping"), spec=ScrapingConfig) +def create_pipeline_runner( + pipeline: dlt.Pipeline, + spider: t.Type[Spider], + batch_size: int = dlt.config.value, + queue_size: int = dlt.config.value, + queue_result_timeout: float = dlt.config.value, + scrapy_settings: t.Optional[AnyDict] = None, +) -> ScrapingHost: + """Creates scraping host instance + This helper only creates pipeline host, so running and controlling + scrapy runner and pipeline is completely delegated to advanced users + """ + queue = ScrapingQueue( # type: ignore + maxsize=queue_size, + batch_size=batch_size, + read_timeout=queue_result_timeout, + ) + + signals = Signals( + pipeline_name=pipeline.pipeline_name, + queue=queue, + ) + + # Just to simple merge + settings = {**SOURCE_SCRAPY_SETTINGS} + if scrapy_settings: + settings = {**scrapy_settings} + + scrapy_runner = ScrapyRunner( + spider=spider, + start_urls=resolve_start_urls(), + signals=signals, + settings=settings, + ) + + pipeline_runner = PipelineRunner( + pipeline=pipeline, + queue=queue, + ) + + scraping_host = ScrapingHost( + queue, + scrapy_runner, + pipeline_runner, + ) + + return scraping_host diff --git a/sources/scraping/queue.py b/sources/scraping/queue.py new file mode 100644 index 000000000..2e02bd602 --- /dev/null +++ b/sources/scraping/queue.py @@ -0,0 +1,89 @@ +import typing as t +from queue import Empty, Queue + +from dlt.common import logger + + +# Please read more at https://mypy.readthedocs.io/en/stable/runtime_troubles.html#not-generic-runtime +T = t.TypeVar("T") + +if 
t.TYPE_CHECKING: + + class _Queue(Queue[T]): + pass + +else: + + class _Queue(Queue, t.Generic[T]): + pass + + +class QueueClosedError(Exception): + pass + + +class ScrapingQueue(_Queue[T]): + def __init__( + self, + maxsize: int = 0, + batch_size: int = 10, + read_timeout: float = 1.0, + ) -> None: + super().__init__(maxsize) + self.batch_size = batch_size + self.read_timeout = read_timeout + self._is_closed = False + + def get_batches(self) -> t.Iterator[t.Any]: + """Batching helper can be wrapped as a dlt.resource + + Returns: + Iterator[Any]: yields scraped items one by one + """ + batch: t.List[T] = [] + while True: + if len(batch) == self.batch_size: + yield batch + batch = [] + + try: + if self.is_closed: + raise QueueClosedError("Queue is closed") + + item = self.get(timeout=self.read_timeout) + batch.append(item) + + # Mark task as completed + self.task_done() + except Empty: + if batch: + yield batch + batch = [] + except QueueClosedError: + logger.info("Queue is closed, stopping...") + + # Return the last batch before exiting + if batch: + yield batch + + break + + def stream(self) -> t.Iterator[t.Any]: + """Streaming generator, wraps get_batches + and handles `GeneratorExit` if dlt closes it. + + Returns: + t.Iterator[t.Any]: returns batches of scraped content + """ + try: + yield from self.get_batches() + except GeneratorExit: + self.close() + + def close(self) -> None: + """Marks queue as closed""" + self._is_closed = True + + @property + def is_closed(self) -> bool: + return self._is_closed diff --git a/sources/scraping/requirements.txt b/sources/scraping/requirements.txt new file mode 100644 index 000000000..1915446e9 --- /dev/null +++ b/sources/scraping/requirements.txt @@ -0,0 +1,2 @@ +dlt>=0.4.5 +scrapy>=2.11.0 \ No newline at end of file diff --git a/sources/scraping/runner.py b/sources/scraping/runner.py new file mode 100644 index 000000000..7203db63a --- /dev/null +++ b/sources/scraping/runner.py @@ -0,0 +1,195 @@ +"""This module contains abstractions to facilitate scraping and loading process""" +import threading +import typing as t +import dlt + +from dlt.common import logger +from pydispatch import dispatcher # type: ignore +from typing_extensions import Self + +from scrapy import signals, Item, Spider # type: ignore +from scrapy.crawler import CrawlerProcess # type: ignore + +from .types import AnyDict, Runnable, P +from .queue import ScrapingQueue + +T = t.TypeVar("T") + + +class Signals: + """Signals context wrapper + + This wrapper is also a callable which accepts `CrawlerProcess` instance + this is required to stop the scraping process as soon as the queue closes. + """ + + def __init__(self, pipeline_name: str, queue: ScrapingQueue[T]) -> None: + self.stopping = False + self.queue = queue + self.pipeline_name = pipeline_name + + def on_item_scraped(self, item: Item) -> None: + if not self.queue.is_closed: + self.queue.put(item) + else: + logger.info( + "Queue is closed, stopping", + extra={"pipeline_name": self.pipeline_name}, + ) + if not self.stopping: + self.on_engine_stopped() + + def on_engine_stopped(self) -> None: + logger.info(f"Crawling engine stopped for pipeline={self.pipeline_name}") + self.stopping = True + self.crawler.stop() + self.queue.close() + self.queue.join() + + def __call__(self, crawler: CrawlerProcess) -> Self: + self.crawler = crawler + return self + + def __enter__(self) -> None: + # We want to receive on_item_scraped callback from + # outside so we don't have to know about any queue instance. 
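+        # Scrapy dispatches its own signals through pydispatch (PyDispatcher is one of
+        # its dependencies), so a plain dispatcher.connect() is enough to subscribe here.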
+ dispatcher.connect(self.on_item_scraped, signals.item_scraped) + + # Once crawling engine stops we would like to know about it as well. + dispatcher.connect(self.on_engine_stopped, signals.engine_stopped) + + def __exit__(self, exc_type: t.Any, exc_val: t.Any, exc_tb: t.Any) -> None: + dispatcher.disconnect(self.on_item_scraped, signals.item_scraped) + dispatcher.disconnect(self.on_engine_stopped, signals.engine_stopped) + + +class ScrapyRunner(Runnable): + """Scrapy runner handles setup and teardown of scrapy crawling""" + + def __init__( + self, + spider: t.Type[Spider], + start_urls: t.List[str], + settings: AnyDict, + signals: Signals, + ) -> None: + self.spider = spider + self.start_urls = start_urls + self.crawler = CrawlerProcess(settings=settings) + self.signals = signals + + def run(self, *args: P.args, **kwargs: P.kwargs) -> None: + """Runs scrapy crawler process + + All `kwargs` are forwarded to `crawler.crawl(**kwargs)`. + Also manages relevant signal handling in proper way. + """ + self.crawler.crawl( + self.spider, + name="scraping_spider", + start_urls=self.start_urls, + **kwargs, + ) + + try: + logger.info("Starting the crawler") + with self.signals(self.crawler): + self.crawler.start() + except Exception: + logger.error("Was unable to start crawling process") + raise + finally: + self.signals.on_engine_stopped() + logger.info("Scraping stopped") + + +class PipelineRunner(Runnable): + """Pipeline runner runs dlt pipeline in a separate thread + Since scrapy wants to run in the main thread it is the only available + option to host pipeline in a thread and communicate via the queue. + """ + + def __init__(self, pipeline: dlt.Pipeline, queue: ScrapingQueue[T]) -> None: + self.pipeline = pipeline + self.queue = queue + + if pipeline.dataset_name and not self.is_default_dataset_name(pipeline): + resource_name = pipeline.dataset_name + else: + resource_name = f"{pipeline.pipeline_name}_results" + + logger.info(f"Resource name: {resource_name}") + + self.scraping_resource = dlt.resource( + # Queue get_batches is a generator so we can + # pass it to pipeline.run and dlt will handle the rest. 
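+            # stream() wraps get_batches() and additionally closes the queue
+            # when dlt closes the underlying generator (see ScrapingQueue.stream).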
+ self.queue.stream(), + name=resource_name, + ) + + def is_default_dataset_name(self, pipeline: dlt.Pipeline) -> bool: + default_name = pipeline.pipeline_name + pipeline.DEFAULT_DATASET_SUFFIX + return pipeline.dataset_name == default_name + + def run( + self, + *args: P.args, + **kwargs: P.kwargs, + ) -> threading.Thread: + """You can use all regular dlt.pipeline.run() arguments + + ``` + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, + dataset_name: str = None, + credentials: Any = None, + table_name: str = None, + write_disposition: TWriteDisposition = None, + columns: TAnySchemaColumns = None, + primary_key: TColumnNames = None, + schema: Schema = None, + loader_file_format: TLoaderFileFormat = None + ``` + """ + + def run() -> None: + try: + self.pipeline.run(self.scraping_resource, **kwargs) # type: ignore[arg-type] + except Exception: + logger.error("Error during pipeline.run call, closing the queue") + raise + finally: + self.queue.close() + + thread_runner = threading.Thread(target=run) + thread_runner.start() + return thread_runner + + +class ScrapingHost: + """Scraping host runs the pipeline and scrapy""" + + def __init__( + self, + queue: ScrapingQueue[T], + scrapy_runner: ScrapyRunner, + pipeline_runner: PipelineRunner, + ) -> None: + self.queue = queue + self.scrapy_runner = scrapy_runner + self.pipeline_runner = pipeline_runner + + def run( + self, + *args: P.args, + **kwargs: P.kwargs, + ) -> None: + """You can pass kwargs which are passed to `pipeline.run`""" + logger.info("Starting pipeline") + pipeline_worker = self.pipeline_runner.run(*args, **kwargs) + + logger.info("Starting scrapy crawler") + self.scrapy_runner.run() + + # Wait to for pipeline finish it's job + pipeline_worker.join() diff --git a/sources/scraping/settings.py b/sources/scraping/settings.py new file mode 100644 index 000000000..e722dafdf --- /dev/null +++ b/sources/scraping/settings.py @@ -0,0 +1,29 @@ +from .types import AnyDict + +SOURCE_BATCH_SIZE: int = 10 +SOURCE_SCRAPY_QUEUE_SIZE: int = 3000 +SOURCE_SCRAPY_QUEUE_RESULT_TIMEOUT: int = 5 +SOURCE_SCRAPY_SETTINGS: AnyDict = { + "LOG_LEVEL": "INFO", + # If not set then will keep logging warning in the console + # https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprinter-implementation + "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", + "TELNETCONSOLE_ENABLED": False, + # How many sub pages to scrape + # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit + "DEPTH_LIMIT": 0, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + }, + "HTTPERROR_ALLOW_ALL": True, + "FAKEUSERAGENT_PROVIDERS": [ + # this is the first provider we'll try + "scrapy_fake_useragent.providers.FakeUserAgentProvider", + # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us + "scrapy_fake_useragent.providers.FakerProvider", + # fall back to USER_AGENT value + "scrapy_fake_useragent.providers.FixedUserAgentProvider", + ], + "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0", +} diff --git a/sources/scraping/types.py b/sources/scraping/types.py new file mode 100644 index 000000000..7451d577c --- /dev/null +++ b/sources/scraping/types.py @@ -0,0 +1,12 @@ +import typing as t + +from typing_extensions import ParamSpec + +AnyDict = t.Dict[str, t.Any] + +P = ParamSpec("P") + + +class Runnable(t.Protocol): + def run(self, *args: 
P.args, **kwargs: P.kwargs) -> t.Any: + pass diff --git a/sources/scraping_pipeline.py b/sources/scraping_pipeline.py new file mode 100644 index 000000000..82eae54b1 --- /dev/null +++ b/sources/scraping_pipeline.py @@ -0,0 +1,106 @@ +from typing import Any + +import dlt +from dlt.sources import DltResource +from scrapy import Spider # type: ignore +from scrapy.http import Response # type: ignore + +from scraping import run_pipeline +from scraping.helpers import create_pipeline_runner + + +class MySpider(Spider): + def parse(self, response: Response, **kwargs: Any) -> Any: + for next_page in response.css("li.next a::attr(href)"): + if next_page: + yield response.follow(next_page, self.parse) + + for quote in response.css("div.quote"): + result = { + "quote": { + "text": quote.css("span.text::text").get(), + "author": quote.css("small.author::text").get(), + "tags": quote.css("div.tags a.tag::text").getall(), + }, + } + yield result + + +def scrape_quotes() -> None: + pipeline = dlt.pipeline( + pipeline_name="scraping", + destination="duckdb", + dataset_name="quotes", + ) + + run_pipeline( + pipeline, + MySpider, + # you can pass scrapy settings overrides here + scrapy_settings={ + "DEPTH_LIMIT": 10, + }, + write_disposition="append", + ) + + +def scrape_quotes_scrapy_configs() -> None: + pipeline = dlt.pipeline( + pipeline_name="scraping_custom_scrapy_configs", + destination="duckdb", + dataset_name="quotes", + ) + + run_pipeline( + pipeline, + MySpider, + # you can pass scrapy settings overrides here + scrapy_settings={ + # How many sub pages to scrape + # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit + "DEPTH_LIMIT": 100, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + }, + "HTTPERROR_ALLOW_ALL": False, + }, + write_disposition="append", + ) + + +def scrape_quotes_callback_access_resource() -> None: + pipeline = dlt.pipeline( + pipeline_name="scraping_resource_callback", + destination="duckdb", + dataset_name="quotes", + ) + + def on_before_start(res: DltResource) -> None: + res.add_limit(2) + + run_pipeline( + pipeline, + MySpider, + batch_size=10, + scrapy_settings={}, + on_before_start=on_before_start, + write_disposition="append", + ) + + +def scrape_quotes_advanced_runner() -> None: + pipeline = dlt.pipeline( + pipeline_name="scraping_advanced_direct", + destination="duckdb", + ) + scraping_host = create_pipeline_runner(pipeline, MySpider, batch_size=10) + scraping_host.pipeline_runner.scraping_resource.add_limit(2) + scraping_host.run(dataset_name="quotes", write_disposition="append") + + +if __name__ == "__main__": + scrape_quotes() + # scrape_quotes_scrapy_configs() + # scrape_quotes_callback_access_resource() + # scrape_quotes_advanced_runner() diff --git a/tests/scraping/__init__.py b/tests/scraping/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/scraping/test_scraping_source.py b/tests/scraping/test_scraping_source.py new file mode 100644 index 000000000..9c8452cd0 --- /dev/null +++ b/tests/scraping/test_scraping_source.py @@ -0,0 +1,151 @@ +from unittest import mock + +import dlt +import pytest + +from twisted.internet import reactor + +from sources.scraping import run_pipeline +from sources.scraping.helpers import create_pipeline_runner +from sources.scraping.queue import ScrapingQueue +from sources.scraping.runner import PipelineRunner + +from tests.utils import ALL_DESTINATIONS, load_table_counts + +from .utils import ( + 
+    MySpider,
+    TestCrawlerProcess,
+    TestQueue,
+    queue_closer,
+    table_expect_at_least_n_records,
+)
+
+
+@pytest.fixture(scope="module")
+def shutdown_reactor():
+    yield True
+
+    try:
+        reactor.stop()
+    except Exception:  # noqa
+        pass
+
+
+def test_scrapy_resource_yields_last_batch_on_queue_get_timeout():
+    queue = ScrapingQueue(read_timeout=1.0, batch_size=5)
+    queue.put({"n": 1})
+    items = next(queue.get_batches())
+    assert len(items) == 1
+
+
+def test_scrapy_resource_yields_last_batch_if_queue_is_closed():
+    queue = ScrapingQueue(read_timeout=1.0, batch_size=2)
+    queue.put({"n": 1})
+    queue.put({"n": 2})
+    queue.put({"n": 3})
+    queue_closer(queue, close_after_seconds=0.1)
+
+    items = list(queue.get_batches())
+    assert len(items) == 2
+
+
+@pytest.mark.skip(
+    reason=(
+        "TODO: Figure out why this test hangs on CI; "
+        "when running locally it never hangs..."
+    )
+)
+@mock.patch("sources.scraping.runner.CrawlerProcess", TestCrawlerProcess)
+@mock.patch("sources.scraping.helpers.ScrapingQueue", TestQueue)
+def test_pipeline_runners_handle_extended_and_simple_use_cases(
+    mocker, shutdown_reactor
+):
+    pipeline = dlt.pipeline(
+        pipeline_name="scraping_res_add_limit",
+        destination="duckdb",
+    )
+
+    spy_on_queue_put = mocker.spy(TestQueue, "put")
+    spy_on_queue_close = mocker.spy(TestQueue, "close")
+    spy_on_crawler_process = mocker.spy(TestCrawlerProcess, "stop")
+    scraping_host = create_pipeline_runner(pipeline, MySpider, batch_size=10)
+    scraping_host.pipeline_runner.scraping_resource.add_limit(2)
+    scraping_host.run(dataset_name="quotes", write_disposition="append")
+
+    table_expect_at_least_n_records("scraping_res_add_limit_results", 20, pipeline)
+    table_expect_at_least_n_records(
+        "scraping_res_add_limit_results__quote__tags", 68, pipeline
+    )
+
+    spy_on_queue_put.assert_called()
+    spy_on_queue_close.assert_called()
+    spy_on_crawler_process.assert_called()
+
+    err_pipeline = dlt.pipeline(
+        pipeline_name="scraping_exc",
+        destination="duckdb",
+        dataset_name="quotes",
+    )
+
+    mocker.patch("dlt.Pipeline.run", side_effect=OSError("bla"))
+    run_pipeline(err_pipeline, MySpider, dataset_name="quotes")
+    spy_on_queue_close.assert_called()
+
+
+def test_resource_name_assignment_and_generation():
+    queue = ScrapingQueue()
+    # If dataset_name is given to the pipeline then the resource
+    # name is the same as dataset_name
+    pipeline1 = dlt.pipeline(
+        pipeline_name="pipeline_one",
+        destination="duckdb",
+        dataset_name="cookies",
+    )
+    pipeline_runner = PipelineRunner(
+        pipeline=pipeline1,
+        queue=queue,
+    )
+    assert pipeline_runner.scraping_resource.name == "cookies"
+
+    # If dataset_name is not given to the pipeline then the resource
+    # name is generated from pipeline_name plus a "_results" suffix
+    pipeline2 = dlt.pipeline(pipeline_name="pipeline_two", destination="duckdb")
+    pipeline_runner2 = PipelineRunner(pipeline2, queue=queue)
+    assert pipeline_runner2.scraping_resource.name == "pipeline_two_results"
+
+
+@pytest.mark.skip(
+    reason=(
+        "This test should run in isolation, with a new interpreter "
+        "for each parametrized destination"
+    )
+)
+@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
+@mock.patch("sources.scraping.runner.CrawlerProcess", TestCrawlerProcess)
+def test_scraping_all_resources(destination_name: str, shutdown_reactor) -> None:
+    pipeline_name = f"scraping_forked_{destination_name}_results"
+    pipeline = dlt.pipeline(
+        pipeline_name=pipeline_name,
+        destination=destination_name,
+        dataset_name="quotes",
+    )
+
+    run_pipeline(
+        pipeline,
+        MySpider,
dataset_name="quotes", + write_disposition="append", + ) + + table_names = [t["name"] for t in pipeline.default_schema.data_tables()] + table_counts = load_table_counts(pipeline, *table_names) + + # for now only check main tables + expected_tables = { + pipeline_name, + "quotes__quote__tags", + } + assert set(table_counts.keys()) >= set(expected_tables) + + table_expect_at_least_n_records(pipeline_name, 100, pipeline) + table_expect_at_least_n_records("quotes__quote__tags", 232, pipeline) diff --git a/tests/scraping/utils.py b/tests/scraping/utils.py new file mode 100644 index 000000000..968d3eb82 --- /dev/null +++ b/tests/scraping/utils.py @@ -0,0 +1,127 @@ +from queue import Empty +from typing import Any, Iterator, List, Type, Union +import time +import threading + +import dlt + +from scrapy import Spider # type: ignore +from scrapy.crawler import Crawler, CrawlerRunner # type: ignore +from scrapy.http import Response # type: ignore +from twisted.internet import reactor +from twisted.internet.defer import Deferred + +from sources.scraping.queue import QueueClosedError, ScrapingQueue + + +class MySpider(Spider): + def parse(self, response: Response, **kwargs: Any) -> Any: + for next_page in response.css("li.next a::attr(href)"): + if next_page: + yield response.follow(next_page, self.parse) + + for quote in response.css("div.quote"): + result = { + "quote": { + "text": quote.css("span.text::text").get(), + "author": quote.css("small.author::text").get(), + "tags": quote.css("div.tags a.tag::text").getall(), + }, + } + + yield result + + +class TestQueue(ScrapingQueue): + """Test queue alters the default get_batches behavior by + adding max attempts count on queue read timeout + """ + + def __init__( + self, maxsize: int = 0, batch_size: int = 10, read_timeout: float = 1.0 + ) -> None: + super().__init__(maxsize, batch_size, read_timeout) + self.max_empty_get_attempts = 5 + + def get_batches(self) -> Iterator[Any]: + batch: List = [] + get_attempts: int = 0 + while True: + if len(batch) == self.batch_size: + yield batch + batch = [] + + try: + if self.is_closed: + raise QueueClosedError("Queue is closed") + + item = self.get(timeout=self.read_timeout) + batch.append(item) + + # Mark task as completed + self.task_done() + except Empty: + if batch: + yield batch + batch = [] + + if get_attempts >= self.max_empty_get_attempts: + self.close() + break + + print("Get attempt #", get_attempts) + get_attempts += 1 + except QueueClosedError: + # Return the last batch before exiting + if batch: + yield batch + + break + + +class TestCrawlerProcess(CrawlerRunner): + def crawl( + self, + crawler_or_spidercls: Union[Type[Spider], str, Crawler], + *args: Any, + **kwargs: Any, + ) -> Deferred: + deferred = super().crawl(crawler_or_spidercls, *args, **kwargs) + deferred.addBoth(lambda _: reactor.stop()) + return deferred + + def start( + self, stop_after_crawl: bool = True, install_signal_handlers: bool = True + ) -> None: + try: + reactor.run() + except Exception: + pass + + +def queue_closer( + queue: ScrapingQueue, close_after_seconds: float = 1.0 +) -> threading.Thread: + def close_queue(): + slept: int = 0 + while True: + time.sleep(1) + slept += 1 + if queue.is_closed: + break + + if slept >= close_after_seconds: + queue.close() + break + + closer = threading.Thread(target=close_queue) + closer.start() + return closer + + +def table_expect_at_least_n_records(table_name: str, n: int, pipeline: dlt.Pipeline): + with pipeline.sql_client() as client: + with client.execute_query(f"SELECT * FROM 
{table_name}") as cursor: + loaded_values = [item for item in cursor.fetchall()] + n_loaded_values = len(loaded_values) + assert n_loaded_values == n, f"Expected {n} records, got {n_loaded_values}"