
Scrapy source using scrapy #332

Merged
merged 107 commits into master from source/scaping
Mar 5, 2024

Changes from 90 commits

Commits (107)
4d28bd9
Scrapy source using scrapy
Jan 25, 2024
ac63f7d
Add batching of results
Jan 26, 2024
59fd5f9
Add pytest-mock and scrapy
Jan 26, 2024
d80446c
Adjust tests
Jan 26, 2024
04d8841
Add pytest-twisted
Jan 26, 2024
2741da8
Add twisted to scrapy dependencies
Jan 26, 2024
a685dae
Add twisted to dev dependencies
Jan 26, 2024
78bcc6f
Add review comments
Jan 29, 2024
3208ea8
Add more checks and do not exit when queue is empty
Jan 30, 2024
167fffa
Create QueueClosedError and handle in listener to exit loop
Jan 30, 2024
93ed13a
Simplify code
Jan 30, 2024
2756610
Stop crawling if queue is closed
Jan 30, 2024
1d73c9b
Fix linting issues
Feb 1, 2024
61fd907
Fix linting issues
Feb 1, 2024
20e10a4
Adjust tests and disable telnet server for scrapy
Feb 1, 2024
b3bf863
Remove pytest-twisted
Feb 1, 2024
4812fcc
Refactor scrapy item pipeline
Feb 5, 2024
f79720f
Eliminate custom spider
Feb 5, 2024
4c727cd
Rename a function
Feb 5, 2024
8b8b417
Simplify code
Feb 5, 2024
a9193e8
Cleanup code
Feb 5, 2024
fc66d97
Update comment
Feb 5, 2024
a822218
Update comment
Feb 5, 2024
6d45468
Fix linting issues
Feb 5, 2024
7799bf1
Define abstract method
Feb 5, 2024
059837f
Update readme
Feb 5, 2024
ba52057
Add more tests
Feb 5, 2024
82963d0
Adjust tests
Feb 5, 2024
ba04471
Use pytest.mark.forked to run tests for ALL_DESTINATIONS
Feb 12, 2024
2f4a378
Add pytest-forked
Feb 12, 2024
07a140d
Update lockfile
Feb 12, 2024
c41fad2
Use scrapy signals
Feb 21, 2024
ade0069
Hide batching and retrieving logic inside queue
sultaniman Feb 22, 2024
7296324
Add more types
sultaniman Feb 22, 2024
7f44d14
Extend default scrapy settings
sultaniman Feb 22, 2024
7785e56
Extract pipeline and scrapy runners
sultaniman Feb 22, 2024
83ec743
Simplify helpers code
sultaniman Feb 22, 2024
aadd6e4
Cleanup code
sultaniman Feb 22, 2024
14a18a1
Add start_urls_file configuration option
sultaniman Feb 22, 2024
78f8777
Sync scrapy log level with dlt log level
sultaniman Feb 22, 2024
0e0d5ff
Expose simple scraping pipeline runner
sultaniman Feb 22, 2024
a2733bd
Adjust config file
sultaniman Feb 22, 2024
d8371de
Connect signals in ScrapyRunner.init
sultaniman Feb 22, 2024
47d24f5
Register source and do cleanups
sultaniman Feb 22, 2024
505acff
Better scrapy setting passing and minor cleanups
sultaniman Feb 23, 2024
de154bb
Remove reduntant code comments
sultaniman Feb 23, 2024
20358cd
Call engine_stopped callback in finally block
sultaniman Feb 23, 2024
f8fc527
Add more docstrings related to runners
sultaniman Feb 23, 2024
9024f6b
Adjust batch size
sultaniman Feb 23, 2024
9cf8ec1
Fix queue batching bugs
sultaniman Feb 23, 2024
7378e75
Pass crawler instance to item_scraped callback
sultaniman Feb 23, 2024
4f5bcb8
Add advanced example to pipeline code
sultaniman Feb 23, 2024
657714a
Access settings override for scrapy
sultaniman Feb 23, 2024
e030b51
Rewrite tests
sultaniman Feb 23, 2024
d38ef2a
Small readme update for bing wembaster
sultaniman Feb 26, 2024
e7ca332
Adjust queue read timeout
sultaniman Feb 27, 2024
a4d9290
Extract test utils for scraping source
sultaniman Feb 27, 2024
ee8f3cd
Add stream generator to queue to handle generator exit exception
sultaniman Feb 27, 2024
2e15f07
Extract singal registering and tearing down as context manager
sultaniman Feb 27, 2024
fc4a244
Adjust and cleanup example pipeline source file
sultaniman Feb 27, 2024
7334a6a
Cleanup scraping helpers
sultaniman Feb 27, 2024
398d732
Adjust tests for scraping pipeline
sultaniman Feb 27, 2024
f7347b1
Add callback access to scraping resource
sultaniman Feb 27, 2024
9139c00
Update readme
sultaniman Feb 27, 2024
f9affb2
Cleanup code
sultaniman Feb 27, 2024
e9a38e1
Import ParamSpec from typing extensions
sultaniman Feb 27, 2024
76ddfbc
Fix linting issues
sultaniman Feb 27, 2024
6ccb247
Fix linting issues
sultaniman Feb 27, 2024
1629a15
Set encoding when opening the file with urls
sultaniman Feb 27, 2024
f512b1d
Adjust typing for scraping testing utils
sultaniman Feb 27, 2024
36e55c5
Adjust typing for scraping testing utils
sultaniman Feb 27, 2024
dbd0d53
Use proper Union syntax
sultaniman Feb 27, 2024
c9da574
Adjust mock patch module path for scraping tests
sultaniman Feb 27, 2024
3cf9b23
Use latest dlt version
sultaniman Feb 27, 2024
e5d34c4
Adjust mock patch module path for scraping tests
sultaniman Feb 27, 2024
84767fb
Adjust tests and mark ones to skip
sultaniman Feb 28, 2024
6431b62
Cleanup tests and utils for scraping source
sultaniman Feb 28, 2024
9352f9d
Re-use spy on queue.close calls
sultaniman Feb 28, 2024
f43d602
Use append write_disposition by default for scraping source
sultaniman Feb 28, 2024
7eba014
Update test skip reason
sultaniman Feb 28, 2024
b5f0f06
Stop crawler manually
sultaniman Mar 1, 2024
2ef1350
Return self from __call__
sultaniman Mar 1, 2024
2c4fd60
Check if crawler.stop is actually called
sultaniman Mar 1, 2024
3ae0f1e
Check if crawling has already been stopping
sultaniman Mar 1, 2024
fb9ddc6
Test to verify resource name generation and override
sultaniman Mar 1, 2024
5a881f1
Adjust resource name selection
sultaniman Mar 1, 2024
9376720
Add more docstrings and update readme
sultaniman Mar 1, 2024
b03d564
Update readme
sultaniman Mar 1, 2024
c03e6ab
Add scrapy configuration in example pipeline
sultaniman Mar 1, 2024
2b4d64a
Adjust tests
sultaniman Mar 1, 2024
cad5924
Shutdown twisted reactor after module tests
sultaniman Mar 4, 2024
b95c9fe
Merge branch 'master' into source/scaping
sultaniman Mar 4, 2024
cb9a8a8
Update lockfile
sultaniman Mar 4, 2024
49847a8
Fix linting issues
sultaniman Mar 4, 2024
65a5019
Use simple run_pipeline
sultaniman Mar 4, 2024
7bfdc80
Close the queue after timeout
sultaniman Mar 4, 2024
d490620
Rewrite a comment and use break instead of return in while loop
sultaniman Mar 4, 2024
8667536
Update comments
sultaniman Mar 4, 2024
7cadb93
Mock queue with alternative implementation
sultaniman Mar 5, 2024
02e467b
Add docstrings and fix a bug in test queue
sultaniman Mar 5, 2024
ded3949
Format code
sultaniman Mar 5, 2024
56e8a54
Debug test queue
sultaniman Mar 5, 2024
2e587b9
Adjust mock patch path
sultaniman Mar 5, 2024
dd9d3f5
Add logging when scrapy stops and re-arrange code actions
sultaniman Mar 5, 2024
ed33d15
Stop crawler in on_engine_stopped
sultaniman Mar 5, 2024
9ef03bf
Call on_engine_stopped from on_item_scraped if the queue is closed
sultaniman Mar 5, 2024
eb8278c
Skip test
sultaniman Mar 5, 2024
7,594 changes: 3,992 additions & 3,602 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions pyproject.toml
@@ -31,6 +31,9 @@ black = "^23.3.0"
pypdf2 = "^3.0.1"
greenlet = "<3.0.0"
confluent-kafka = "^2.3.0"
pytest-mock = "^3.12.0"
twisted = "22.10.0"
pytest-forked = "^1.6.0"

[tool.poetry.group.sql_database.dependencies]
sqlalchemy = ">=1.4"
@@ -80,6 +83,11 @@ pyairtable = "^2.1.0.post1"
[tool.poetry.group.filesystem.dependencies]
adlfs = ">=2023.9.0"


[tool.poetry.group.scrapy.dependencies]
scrapy = "^2.11.0"
twisted = "22.10.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
9 changes: 9 additions & 0 deletions sources/.dlt/config.toml
@@ -60,3 +60,12 @@ access_token_expires_at=1688821881

[sources.workable]
subdomain="dlthub-test"

[sources.scraping]
batch_size = 100
queue_size = 3000
queue_result_timeout = 3.0
start_urls_file="/path/to/urls.txt"
start_urls = [
"https://quotes.toscrape.com/page/1/"
]
2 changes: 1 addition & 1 deletion sources/bing_webmaster/README.md
@@ -57,7 +57,7 @@ api_key = "Please set me up!" # please set me up!
3. Now the pipeline can be run by using the command:

```bash
python3 bing_webmaster_pipeline.py
python bing_webmaster_pipeline.py
```

3. To make sure that everything is loaded as expected, use the command:
2 changes: 1 addition & 1 deletion sources/google_sheets/helpers/data_processing.py
@@ -224,7 +224,7 @@ def serial_date_to_datetime(
)
# int values are dates, float values are datetimes
if data_type == "date":
return conv_datetime.date() # type: ignore
return conv_datetime.date()

return conv_datetime

6 changes: 3 additions & 3 deletions sources/kinesis/helpers.py
@@ -2,7 +2,7 @@

import dlt
from dlt.common import pendulum
from dlt.common.typing import DictStrStr, StrStr
from dlt.common.typing import DictStrAny, StrAny, StrStr


def get_shard_iterator(
@@ -11,7 +11,7 @@ def get_shard_iterator(
shard_id: str,
last_msg: dlt.sources.incremental[StrStr],
initial_at_timestamp: pendulum.DateTime,
) -> Tuple[str, StrStr]:
) -> Tuple[str, StrAny]:
"""Gets shard `shard_id` of `stream_name` iterator. If `last_msg` incremental is present it may
contain last message sequence for shard_id. in that case AFTER_SEQUENCE_NUMBER is created.
If no message sequence is present, `initial_at_timestamp` is used for AT_TIMESTAMP or LATEST.
@@ -20,7 +20,7 @@
sequence_state = (
{} if last_msg is None else last_msg.last_value or last_msg.initial_value or {}
)
iterator_params: DictStrStr
iterator_params: DictStrAny
msg_sequence = sequence_state.get(shard_id, None)
if msg_sequence:
iterator_params = dict(
112 changes: 112 additions & 0 deletions sources/scraping/README.md
@@ -0,0 +1,112 @@
---
title: Scraping with DLT
description: dlt source to scrape web content
keywords: [scrapy, scraping, spiders, crawler, crawling]
---

# Scraping

The scraping source allows you to scrape content from the web and uses [Scrapy](https://doc.scrapy.org/en/latest/)
to enable this capability.

It is possible to access and manipulate the scraping resource (please see `scraping_pipeline.py` and the sketch after this list) via:

1. `on_before_start` callback which will receive a `DltResource` as the only argument,
2. The advanced scraping pipeline builder `scraping.helpers.create_pipeline_runner`
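
A minimal sketch of the first option. The spider class, module paths, and table name below are illustrative assumptions; the actual example lives in `scraping_pipeline.py`:

```py
import dlt
from dlt.sources import DltResource

from scraping import run_pipeline
from scraping_pipeline import MySpider  # hypothetical spider, for illustration only


def on_before_start(resource: DltResource) -> None:
    # Adjust the dlt resource before the run starts,
    # e.g. set the target table name and write disposition.
    resource.apply_hints(table_name="quotes", write_disposition="append")


pipeline = dlt.pipeline(
    pipeline_name="scraping_example",
    destination="duckdb",
    dataset_name="quotes",
)

run_pipeline(
    pipeline,
    MySpider,
    on_before_start=on_before_start,
    batch_size=10,
)
```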

## Initialize the pipeline

```bash
dlt init scraping duckdb
```

## 🎲 Configuration

It is possible to provide configuration via `.dlt/config.toml`; below you can see an example:

```toml
[sources.scraping]
# Batch size - how many scraped results to collect
# before dispatching to DLT pipeline
batch_size = 100
# Default queue size
queue_size = 3000
# How long to wait before exiting
queue_result_timeout = 3.0
start_urls = [
"https://quotes.toscrape.com/page/1/"
]
start_urls_file="/path/to/urls.txt"
```

When both `start_urls` and `start_urls_file` are provided, they will be merged and deduplicated so Scrapy
gets a unique set of `start_urls`.

## 🏎️ Running the pipeline

Install requirements and run the pipeline

```sh
pip install -r requirements.txt
python scraping_pipeline.py
```

## Implementing a spider

It is your responsibility to implement the spider and the data extraction logic for the responses,
because our runner expects a spider class. Please see the example spider in `scraping_pipeline.py` as a reference,
and the minimal sketch below. For more information about spider implementation please also see the [Scrapy docs](https://docs.scrapy.org/en/latest/topics/spiders.html).
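
A minimal sketch of a spider; the class name and selectors are illustrative, assuming the quotes.toscrape.com pages from the configuration above:

```py
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    # start_urls are supplied by the runner from the
    # `start_urls` / `start_urls_file` configuration.

    def parse(self, response):
        # Yield one dict per quote; each yielded item is put on the
        # queue and loaded by the dlt resource.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }

        # Follow pagination links, if any.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
```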

## Configuring Scrapy

You can pass scrapy settings via

1. `run_pipeline(..., scrapy_settings={...})`,
2. `create_pipeline_runner(..., scrapy_settings={...})`,
3. Overriding defaults in `settings.py`.

Example:
```py
run_pipeline(
pipeline,
MySpider,
scrapy_settings={
# How many sub pages to scrape
# https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit
"DEPTH_LIMIT": 0,
"SPIDER_MIDDLEWARES": {
"scrapy.spidermiddlewares.depth.DepthMiddleware": 200,
"scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300,
},
"HTTPERROR_ALLOW_ALL": True,
},
)
```

Note: this is just a shallow merge.
Also, the log level is automatically set in sync with the one
dlt provides, so passing it via `scrapy_settings` as `"LOG_LEVEL": "DEBUG"` will not work;
please see the [logging documentation](https://dlthub.com/docs/running-in-production/running#set-the-log-level-and-format) for dlt.

## 🧐 Introspection using streamlit

NOTE: you might need to install `streamlit` first: `pip install streamlit`

```sh
dlt pipeline <pipeline_name> show
```

## 🧠 How does it work?

Under the hood we run the DLT [pipeline](https://dlthub.com/docs/api_reference/pipeline) in a separate thread while Scrapy runs in the main thread.

Communication between the two is done via a queue (see the sketch after this list), where

- the spider is responsible for putting the results into the queue,
- the DLT resource collects and batches results from the queue.
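
A greatly simplified sketch of this producer/consumer pattern; the names and sizes below are illustrative, not the actual implementation:

```py
import queue
import threading

item_queue: "queue.Queue[dict]" = queue.Queue(maxsize=3000)  # queue_size


def consume_batches(batch_size: int = 100) -> None:
    # Runs alongside Scrapy: drain the queue and hand batches
    # of scraped items to the dlt pipeline.
    batch = []
    while True:
        try:
            item = item_queue.get(timeout=3.0)  # queue_result_timeout
        except queue.Empty:
            break  # nothing arrived in time, stop consuming
        batch.append(item)
        if len(batch) >= batch_size:
            ...  # pipeline.run(batch) in the real source
            batch = []
    if batch:
        ...  # flush the remaining items


threading.Thread(target=consume_batches).start()
# Meanwhile Scrapy runs in the main thread and the spider's items reach
# the queue via an item pipeline / signal handler calling item_queue.put(item).
```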

![simple diagram](./diagram.png)

<p align="center"><strong>Enjoy it!</strong></p>
<hr>
<p align="center">✨ 🚀 ✨</p>
74 changes: 74 additions & 0 deletions sources/scraping/__init__.py
@@ -0,0 +1,74 @@
"""Scraping source

Integrates Dlt and Scrapy to facilitate scraping pipelines.
"""
import inspect
import typing as t

import dlt

from dlt.sources import DltResource
from dlt.common.source import _SOURCES, SourceInfo

from scrapy import Spider # type: ignore

from .helpers import ScrapingConfig, create_pipeline_runner
from .types import P, AnyDict


def run_pipeline( # type: ignore[valid-type]
pipeline: dlt.Pipeline,
spider: t.Type[Spider],
*args: P.args,
on_before_start: t.Callable[[DltResource], None] = None,
scrapy_settings: t.Optional[AnyDict] = None,
batch_size: t.Optional[int] = None,
queue_size: t.Optional[int] = None,
queue_result_timeout: t.Optional[float] = None,
**kwargs: P.kwargs,
) -> None:
"""Simple runner for the scraping pipeline

You can pass all parameters via kwargs to `dlt.pipeline.run(....)`

```
destination: TDestinationReferenceArg = None,
staging: TDestinationReferenceArg = None,
dataset_name: str = None,
credentials: Any = None,
table_name: str = None,
write_disposition: TWriteDisposition = None,
columns: TAnySchemaColumns = None,
primary_key: TColumnNames = None,
schema: Schema = None,
loader_file_format: TLoaderFileFormat = None
```
"""
options: AnyDict = {}
if scrapy_settings:
options["scrapy_settings"] = scrapy_settings

if batch_size:
options["batch_size"] = batch_size

if queue_size:
options["queue_size"] = queue_size

if queue_result_timeout:
options["queue_result_timeout"] = queue_result_timeout

scraping_host = create_pipeline_runner(pipeline, spider, **options)

if on_before_start:
on_before_start(scraping_host.pipeline_runner.scraping_resource)

scraping_host.run(*args, **kwargs)


# This way we allow dlt init to detect scraping source it is indeed hacky
# and the core team is working to provide a better alternative.
_SOURCES[run_pipeline.__qualname__] = SourceInfo(
ScrapingConfig,
run_pipeline,
inspect.getmodule(run_pipeline),
)
Binary file added sources/scraping/diagram.png
99 changes: 99 additions & 0 deletions sources/scraping/helpers.py
@@ -0,0 +1,99 @@
import os
import typing as t

import dlt
from dlt.common.configuration.inject import with_config
from dlt.common.configuration.specs.base_configuration import (
configspec,
BaseConfiguration,
)

from scrapy import Spider # type: ignore

from .queue import ScrapingQueue
from .settings import SOURCE_SCRAPY_QUEUE_SIZE, SOURCE_SCRAPY_SETTINGS
from .runner import ScrapingHost, PipelineRunner, ScrapyRunner, Signals
from .types import AnyDict


@configspec
class ScrapingConfig(BaseConfiguration):
# Batch size for scraped items
batch_size: int = 100

# maxsize for queue
queue_size: t.Optional[int] = SOURCE_SCRAPY_QUEUE_SIZE

# result wait timeout for our queue
queue_result_timeout: t.Optional[float] = 1.0

# List of start urls
start_urls: t.List[str] = None
start_urls_file: str = None


@with_config(sections=("sources", "scraping"), spec=ScrapingConfig)
def resolve_start_urls(
start_urls: t.Optional[t.List[str]] = dlt.config.value,
start_urls_file: t.Optional[str] = dlt.config.value,
) -> t.List[str]:
"""Merges start urls
If both `start_urls` and `start_urls_file` given, we will merge them
and return deduplicated list of `start_urls` for scrapy spider.
"""
urls = set()
if os.path.exists(start_urls_file):
with open(start_urls_file, encoding="utf-8") as fp:
urls = {line for line in fp.readlines() if str(line).strip()}

if start_urls:
for url in start_urls:
urls.add(url)

return list(set(urls))


@with_config(sections=("sources", "scraping"), spec=ScrapingConfig)
def create_pipeline_runner(
pipeline: dlt.Pipeline,
spider: t.Type[Spider],
batch_size: int = dlt.config.value,
queue_size: int = dlt.config.value,
queue_result_timeout: float = dlt.config.value,
scrapy_settings: t.Optional[AnyDict] = None,
) -> ScrapingHost:
queue = ScrapingQueue( # type: ignore
maxsize=queue_size,
batch_size=batch_size,
read_timeout=queue_result_timeout,
)

signals = Signals(
pipeline_name=pipeline.pipeline_name,
queue=queue,
)

# Just to simple merge
settings = {**SOURCE_SCRAPY_SETTINGS}
if scrapy_settings:
settings = {**scrapy_settings}

scrapy_runner = ScrapyRunner(
spider=spider,
start_urls=resolve_start_urls(),
signals=signals,
settings=settings,
)

pipeline_runner = PipelineRunner(
pipeline=pipeline,
queue=queue,
)

scraping_host = ScrapingHost(
queue,
scrapy_runner,
pipeline_runner,
)

return scraping_host