openzim · benoit74 · Jul 25, 2023 · Jul 14, 2023 · Jul 14, 2023 · Jul 14, 2023
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -21,8 +21,8 @@ jobs:
 
       - name: Build packages
         run: |
-          pip install -U pip hatch
-          hatch build
+          pip install -U pip build
+          python -m build sdist wheel
 
       - name: Upload to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1.8
@@ -31,7 +31,6 @@ jobs:
         uses: openzim/docker-publish-action@v10
         with:
           image-name: openzim/kolibri
-          on-master: dev
           tag-pattern: /^v([0-9.]+)$/
           latest-on-tag: true
           restrict-to: openzim/kolibri

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- Add `--long-description` CLI parameter to set ZIM long description
+- Add `--node-ids` CLI parameter to process only few channel nodes (_useful for debugging mostly_)
+
 ### Fixed
+- Fixed issue with ZIM description too long when sourced from channel metadata
+- Fixed issue with ZIM icon sizes / formats
 - Fix issue with ePub rendering which was outside the iframe
 - Description is now limited to expected lenght and long description is set
 - Icons and illustrations are squared as expected
@@ -17,11 +23,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Migrate to our new Python standard (hatch, ruff, pyright, ...)
 - Using zimscraperlib 3.1.1
-- Updated image to `python:3.11-bullseye`
+- Updated image to `python:3.11-bookworm`
 - Retry video reencoding up to three times
 - Move inline javascript to dedicated files
 - Move huge inline CSS to dedicated file
-- Add `--node-ids` CLI parameter to process only few nodes (useful for debugging)
 
 ## [1.0.1] - 2023-02-22
 
@@ -34,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [1.0.0] - 2021-11-11
 
+### Added
 - initial version
 - supports topic/document/audio/video/html5/exercise content types
 - uses libzim7

diff --git a/hatch_build.py b/hatch_build.py
@@ -29,7 +29,7 @@ def initialize(self, version, build_data):
             return
         Path(self.root).joinpath("src/kolibri2zim/templates/assets")
         subprocess.run(
-            str(Path(self.root).joinpath("get_js_deps.sh")),  # : S603
+            str(Path(self.root).joinpath("get_js_deps.sh")),
             check=True,
         )
         return super().initialize(version, build_data)
@@ -38,7 +38,8 @@ def deps_already_installed(self) -> bool:
         for dep in JS_DEPS:
             if (
                 not Path(self.root)
-                .joinpath(f"src/kolibri2zim/templates/assets/{dep}")
+                .joinpath("src/kolibri2zim/templates/assets")
+                .joinpath(dep)
                 .exists()
             ):
                 return False

diff --git a/pyproject.toml b/pyproject.toml
@@ -162,6 +162,8 @@ select = [
 ignore = [
   # Allow non-abstract empty methods in abstract base classes
   "B027",
+  # Allow use of datetime with tz and date.today
+  "DTZ005", "DTZ011",
   # Remove flake8-errmsg since we consider they bloat the code and provide limited value
   "EM",
   # Allow boolean positional values in function calls, like `dict.get(... True)`

diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
@@ -32,7 +32,7 @@
 from zimscraperlib.zim.creator import Creator
 from zimscraperlib.zim.items import StaticItem
 
-from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, get_logger
+from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, Global, get_logger
 from kolibri2zim.database import KolibriDB
 from kolibri2zim.debug import (
     ON_DISK_THRESHOLD,
@@ -108,11 +108,12 @@
 
         # zim params
         self.fname = go("fname")
-        tags = go("tags")
-        if tags is None:
-            self.tags = []
-        else:
-            self.tags = [t.strip() for t in tags.split(",")]
+        self.tags = (
+            []
+            if go("tags") is None
+            else [t.strip() for t in go("tags").split(",")]  # pyright: ignore
+        )
+
         self.title = go("title")
         self.description = go("description")
         self.long_description = go("long_description")
@@ -126,17 +127,14 @@
         self.css = go("css")
 
         # directory setup
-        self.output_dir = Path(str(go("output_dir"))).expanduser().resolve()
-        tmp_dir = go("tmp_dir")
-        if tmp_dir:
-            Path(tmp_dir).mkdir(parents=True, exist_ok=True)
-        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))
+        self.output_dir = Path(go("output_dir") or "/output").expanduser().resolve()
+        if go("tmp_dir"):
+            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)  # pyright: ignore
+        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))
 
         # performances options
-        nb_threads_str = go("threads")
-        self.nb_threads = int(nb_threads_str) if nb_threads_str else None
-        nb_processes_str = go("processes")
-        self.nb_processes = int(nb_processes_str) if nb_processes_str else None
+        self.nb_threads = int(go("threads") or 1)
+        self.nb_processes = int(go("processes") or Global.nb_available_cpus)
         self.s3_url_with_credentials = go("s3_url_with_credentials")
         self.s3_storage = None
         self.dedup_html_files = go("dedup_html_files")
@@ -146,9 +144,10 @@
         self.keep_build_dir = go("keep_build_dir")
         self.debug = go("debug")
         self.only_topics = go("only_topics")
-        node_ids = go("node_ids")
         self.node_ids = (
-            None if node_ids is None else [t.strip() for t in node_ids.split(",")]
+            None
+            if go("node_ids") is None
+            else [t.strip() for t in go("node_ids").split(",")]  # pyright: ignore
         )
 
         # jinja2 environment setup
@@ -752,152 +751,151 @@
        logger.debug(f"Added HTML5 node #{node_id}")

    def run(self):
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")

        s3_msg = (
            f"  using cache: {self.s3_storage.url.netloc} "
            f"with bucket: {self.s3_storage.bucket_name}"
            if self.s3_storage
            else ""
        )
        logger.info(
            f"Starting scraper with:\n"
            f"  channel_id: {self.channel_id}\n"
            f"  build_dir: {self.build_dir}\n"
            f"  output_dir: {self.output_dir}\n"
            f"  using webm : {self.use_webm}\n"
            f"  low_quality : {self.low_quality}\n"
            f"{s3_msg}"
        )

        self.ensure_js_deps_are_present()

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(
            f"  Starting ZIM creation with:\n"
            f"  filename: {self.fname}\n"
            f"  title: {self.title}\n"
            f"  description: {self.description}\n"
            f"  creator: {self.author}\n"
            f"  publisher: {self.publisher}\n"
            f"  tags: {';'.join(self.tags)}"
        )

        logger.info("Retrieving favicon")
        self.retrieve_favicon()

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.creator_lock = threading.Lock()
        if not self.root_id:
            logger.error("Missing root id")
            return 1
        if not self.title:
            logger.error("Missing title")
            return 1
        if not self.description:
            logger.error("Missing description")
            return 1
        if not self.author:
            logger.error("Missing author")
            return 1
        if not self.publisher:
            logger.error("Missing publisher")
            return 1
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.clean_fname),
            main_path=self.root_id,
            ignore_duplicates=True,
        )
        self.creator.config_metadata(
            Name=self.clean_fname,
            Language="eng",
            Title=self.title,
            Description=self.description,
             LongDescription=self.long_description,
             Creator=self.author,
             Publisher=self.publisher,
-            Date=datetime.datetime.now(datetime.UTC),
+            Date=datetime.date.today(),
             Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(),
         )
         self.creator.start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets", self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures's callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            succeeded = (
                not result.not_done
                and sum([1 if fs.exception() else 0 for fs in result.done]) == 0
            )

            # DEBUG: raise first exception
            if not succeeded and result.done:
                logger.info(
                     f"FAILURE not_done={len(result.not_done)} done={len(result.done)}"
                 )
                 for future in result.done:
-                    future_exception = future.exception()
-                    if future_exception:
-                        raise future_exception
+                    if future.exception():
+                        raise future.exception()  # pyright:ignore
         except KeyboardInterrupt:
             self.creator.can_finish = False
             logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
            # we need to release libzim's resources.
            # currently does nothing but crash if can_finish=False but that's awaiting
            # impl. at libkiwix level
            with self.creator_lock:
                self.creator.finish()

        if not self.keep_build_dir:
            logger.info("Removing build folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
@@ -925,64 +923,53 @@
        self.db = KolibriDB(fpath, self.root_id)
        self.root_id = self.db.root_id

    def sanitize_inputs(self):
         channel_meta = self.db.get_channel_metadata(self.channel_id)
 
         # input  & metadata sanitation
-        period = datetime.datetime.now(datetime.UTC).strftime("%Y-%m")
+        period = datetime.datetime.now().strftime("%Y-%m")
         if self.fname:
             # make sure we were given a filename and not a path
             fname_path = Path(str(self.fname).format(period=period))
             if Path(fname_path.name) != fname_path:
-                raise ValueError(f"filename is not a filename: {fname_path}")
+                raise ValueError(f"filename is not a filename: {self.fname}")
             self.clean_fname = str(fname_path)
         else:
             self.clean_fname = f"{self.name}_{period}.zim"

        if not self.title:
             self.title = channel_meta["name"]
         self.title = self.title.strip()
 
+        if self.description and len(self.description) > MAX_DESC_LENGTH:
+            raise ValueError(
+                f"Description too long ({len(self.description)}>{MAX_DESC_LENGTH})"
+            )
+        if self.long_description and len(self.long_description) > MAX_LONG_DESC_LENGTH:
+            raise ValueError(
+                f"LongDescription too long ({len(self.long_description)}"
+                f">{MAX_LONG_DESC_LENGTH})"
+            )
+
+        kolibri_desc = channel_meta["description"].strip()
+        if not self.long_description and len(kolibri_desc) > MAX_DESC_LENGTH:
+            self.long_description = kolibri_desc[0:MAX_LONG_DESC_LENGTH]
+            if len(kolibri_desc) > MAX_LONG_DESC_LENGTH:
+                self.long_description = self.long_description[:-1] + "…"
         if not self.description:
-            # User did not provided a description, we will infer it from channel
-            # metadata, limited to maximum length
-            if self.long_description:
-                raise ValueError(
-                    "long_description cannot be set if description is not set"
-                )
-            self.description = channel_meta["description"].strip()
-            if len(self.description) > MAX_DESC_LENGTH:
-                self.long_description = self.description
-                self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…"
-                if len(self.long_description) > MAX_LONG_DESC_LENGTH:
-                    self.long_description = (
-                        f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…"
-                    )
-        else:
-            self.description = self.description.strip()
-            if len(self.description) > MAX_DESC_LENGTH:
-                raise ValueError(
-                    f"description is too long ({len(self.description)}"
-                    f">{MAX_DESC_LENGTH})"
-                )
-            if (
-                self.long_description
-                and len(self.long_description) > MAX_LONG_DESC_LENGTH
-            ):
-                raise ValueError(
-                    f"long_description is too long ({len(self.long_description)}"
-                    f">{MAX_LONG_DESC_LENGTH})"
-                )
+            self.description = kolibri_desc[0:MAX_DESC_LENGTH]
+            if len(kolibri_desc) > MAX_DESC_LENGTH:
+                self.description = self.description[:-1] + "…"
 
         if not self.author:
             self.author = channel_meta["author"] or "Kolibri"
        self.author = self.author.strip()

        if not self.publisher:
            self.publisher = "Openzim"
        self.publisher = self.publisher.strip()

        self.tags = list({*self.tags, "_category:other", "kolibri", "_videos:yes"})

    def retrieve_favicon(self):
        favicon_orig = self.build_dir / "favicon"

diff --git a/tests/test_sanitize_inputs.py b/tests/test_sanitize_inputs.py
@@ -92,7 +92,7 @@ def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]):
             LONG_TEXT[0:MAX_DESC_LEN],
             None,
         ),
-        # CLI description set and is too long, channel description doe not matter
+        # CLI description set and is too long, channel description does not matter
         (LONG_TEXT[0 : MAX_DESC_LEN + 1], None, TEXT_NOT_USED, True, None, None),
         # CLI description not set and channel description is short enough
         (None, None, LONG_TEXT[0:MAX_DESC_LEN], False, LONG_TEXT[0:MAX_DESC_LEN], None),
@@ -145,14 +145,14 @@ def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]):
             None,
         ),
         # CLI description not set, CLI long descripion set and is short,
-        # channel description does not matter
+        # channel description set to something different than long desc
         (
             None,
             LONG_TEXT[0:MAX_LONG_DESC_LEN],
-            TEXT_NOT_USED,
-            True,
-            None,
-            None,
+            LONG_TEXT[10:MAX_LONG_DESC_LEN],
+            False,
+            LONG_TEXT[10 : MAX_DESC_LEN + 9] + "…",
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
         ),
     ],
 )