From e91b5d16e54ed569035a337e1773729357399bc3 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 08:58:02 +0200
Subject: [PATCH 01/45] Use standard .gitignore from toptal

---
 .gitignore | 249 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 245 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index fb4df5f..d4c96ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,178 @@
+# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,database,visualstudiocode,intellij
+# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,database,visualstudiocode,intellij
 
+### Database ###
+*.accdb
+*.db
+*.dbf
+*.mdb
+*.pdb
+*.sqlite3
+*.db-shm
+*.db-wal
+
+### Intellij ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Intellij Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+
+# Sonarlint plugin
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+.idea/**/sonarlint/
+
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+.idea/**/sonarIssues.xml
+
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+.idea/**/markdown-navigator.xml
+.idea/**/markdown-navigator-enh.xml
+.idea/**/markdown-navigator/
+
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+.idea/$CACHE_FILE$
+
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+.idea/codestream.xml
+
+# Azure Toolkit for IntelliJ plugin
+# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
+.idea/**/azureSettings.xml
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -21,7 +195,6 @@ parts/
 sdist/
 var/
 wheels/
-pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
@@ -51,6 +224,7 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+cover/
 
 # Translations
 *.mo
@@ -73,6 +247,7 @@ instance/
 docs/_build/
 
 # PyBuilder
+.pybuilder/
 target/
 
 # Jupyter Notebook
@@ -83,7 +258,9 @@ profile_default/
 ipython_config.py
 
 # pyenv
-.python-version
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
 
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -92,7 +269,22 @@ ipython_config.py
 #   install all needed dependencies.
 #Pipfile.lock
 
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 
 # Celery stuff
@@ -129,7 +321,50 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-.DS_Store
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,database,visualstudiocode,intellij
+
 
 # assets that we download
 kolibri2zim/templates/assets/bootstrap/
@@ -143,3 +378,9 @@ kolibri2zim/templates/assets/epub.min.js
 kolibri2zim/templates/assets/bootstrap-icons/
 kolibri2zim/templates/assets/jszip.min.js
 kolibri2zim/templates/assets/perseus/
+
+# output dir
+output
+
+# ignore all vscode, this is not standard configuration in this place
+.vscode

From 6afd9a98071af00f75dca210124e70c6583a3a08 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 08:58:47 +0200
Subject: [PATCH 02/45] Fail get_js_deps script on download / unzip errors

---
 get_js_deps.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/get_js_deps.sh b/get_js_deps.sh
index ff10f77..9936269 100755
--- a/get_js_deps.sh
+++ b/get_js_deps.sh
@@ -1,5 +1,7 @@
 #!/bin/sh
 
+set -e
+
 ###
 # download JS dependencies and place them in our templates/assets folder
 # then launch our ogv.js script to fix dynamic loading links

From e043236f324c8ec90273e5a5b19083d56247e087 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 08:59:44 +0200
Subject: [PATCH 03/45] Migrate version info to __about__.py to match
 standard

---
 kolibri2zim/VERSION      | 1 -
 kolibri2zim/__about__.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 kolibri2zim/VERSION
 create mode 100644 kolibri2zim/__about__.py

diff --git a/kolibri2zim/VERSION b/kolibri2zim/VERSION
deleted file mode 100644
index 7dea76e..0000000
--- a/kolibri2zim/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-1.0.1
diff --git a/kolibri2zim/__about__.py b/kolibri2zim/__about__.py
new file mode 100644
index 0000000..f4c3a84
--- /dev/null
+++ b/kolibri2zim/__about__.py
@@ -0,0 +1 @@
+__version__ = "1.1.0-dev0"

From 6caf0d467171f7622d23a02c2680bc9235ce766e Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:00:30 +0200
Subject: [PATCH 04/45] Migrate to pyproject.toml + hatch instead of setuptools

---
 hatch_build.py |  16 +++++
 pyproject.toml | 183 +++++++++++++++++++++++++++++++++++++++++++++++++
 setup.py       |  53 --------------
 tasks.py       |  76 ++++++++++++++++++++
 4 files changed, 275 insertions(+), 53 deletions(-)
 create mode 100644 hatch_build.py
 create mode 100644 pyproject.toml
 delete mode 100644 setup.py
 create mode 100644 tasks.py

diff --git a/hatch_build.py b/hatch_build.py
new file mode 100644
index 0000000..66b1f69
--- /dev/null
+++ b/hatch_build.py
@@ -0,0 +1,16 @@
+import logging
+import subprocess
+from pathlib import Path
+
+from hatchling.builders.hooks.plugin.interface import BuildHookInterface
+
+logger = logging.getLogger(__name__)
+
+
+class GetJsDepsHook(BuildHookInterface):  # hatchling build hook running get_js_deps.sh
+    def initialize(self, version, build_data):
+        subprocess.run(
+            Path(self.root).joinpath("get_js_deps.sh").as_posix(),  # noqa: S603
+            check=True,  # a non-zero exit of the script raises CalledProcessError
+        )
+        return super().initialize(version, build_data)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2f44f8a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,183 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "kolibri2zim"
+authors = [{ name = "Kiwix", email = "dev@kiwix.org" }]
+keywords = ["kiwix", "zim", "offline", "kolibri"]
+requires-python = ">=3.11"
+description = "Make ZIM file from Kolibri Channels"
+readme = "README.md"
+license = { text = "GPL-3.0-or-later" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+]
+dependencies = [
+    "zimscraperlib==3.1.0",
+    "kiwixstorage==0.8.3",
+    "Jinja2==3.1.2",
+    "pif==0.8.2",
+    "beautifulsoup4==4.9.3",
+    "retrying==1.3.4",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+scripts = ["invoke==2.1.3"]
+lint = ["black==23.3.0", "ruff==0.0.272"]
+check = ["pyright==1.1.317"]
+dev = [
+    "debugpy",
+    "kolibri2zim[scripts]",
+    "kolibri2zim[lint]",
+    "kolibri2zim[check]",
+]
+
+[project.urls]
+Homepage = "https://github.com/openzim/kolibri"
+Donate = "https://www.kiwix.org/en/support-us/"
+
+[project.scripts]
+kolibri2zim = "kolibri2zim.entrypoint:main"
+
+[tool.hatch.version]
+path = "kolibri2zim/__about__.py"
+
+[tool.hatch.build]
+exclude = ["/.github"]
+
+[tool.hatch.build.hooks.custom]
+
+[tool.hatch.envs.default]
+features = ["dev"]
+
+[tool.hatch.envs.lint]
+template = "lint"
+python = "py311"
+skip-install = false
+features = ["scripts", "lint"]
+
+[tool.hatch.envs.lint.scripts]
+black = "inv lint-black --args '{args}'"
+ruff = "inv lint-ruff --args '{args}'"
+all = "inv lintall --args '{args}'"
+fix-black = "inv fix-black --args '{args}'"
+fix-ruff = "inv fix-ruff --args '{args}'"
+fixall = "inv fixall --args '{args}'"
+
+[tool.hatch.envs.check]
+features = ["scripts", "check"]
+
+[tool.hatch.envs.check.scripts]
+pyright = "inv check-pyright --args '{args}'"
+all = "inv checkall --args '{args}'"
+
+[tool.black]
+line-length = 88
+target-version = ['py311']
+
+[tool.ruff]
+target-version = "py311"
+line-length = 88
+src = ["kolibri2zim"]
+select = [
+    # "A", # flake8-builtins
+    # "ANN",  # flake8-annotations
+    "ARG", # flake8-unused-arguments
+    # "ASYNC",  # flake8-async
+    # "B", # flake8-bugbear
+    # "BLE",  # flake8-blind-except
+    "C4",  # flake8-comprehensions
+    "C90", # mccabe
+    # "COM",  # flake8-commas
+    # "D",  # pydocstyle
+    # "DJ",  # flake8-django
+    "DTZ", # flake8-datetimez
+    "E",   # pycodestyle (default)
+    "EM",  # flake8-errmsg
+    # "ERA",  # eradicate
+    # "EXE",  # flake8-executable
+    "F", # Pyflakes (default)
+    # "FA",  # flake8-future-annotations
+    "FBT", # flake8-boolean-trap
+    # "FLY",  # flynt
+    # "G",  # flake8-logging-format
+    "I",   # isort
+    "ICN", # flake8-import-conventions
+    # "INP",  # flake8-no-pep420
+    # "INT",  # flake8-gettext
+    "ISC", # flake8-implicit-str-concat
+    "N",   # pep8-naming
+    # "NPY",  # NumPy-specific rules
+    # "PD",  # pandas-vet
+    # "PGH",  # pygrep-hooks
+    # "PIE",  # flake8-pie
+    # "PL",  # Pylint
+    "PLC", # Pylint: Convention
+    "PLE", # Pylint: Error
+    "PLR", # Pylint: Refactor
+    "PLW", # Pylint: Warning
+    # "PT",  # flake8-pytest-style
+    # "PTH",  # flake8-use-pathlib
+    # "PYI",  # flake8-pyi
+    "Q", # flake8-quotes
+    # "RET",  # flake8-return
+    # "RSE",  # flake8-raise
+    "RUF", # Ruff-specific rules
+    "S",   # flake8-bandit
+    # "SIM",  # flake8-simplify
+    # "SLF",  # flake8-self
+    "T10", # flake8-debugger
+    "T20", # flake8-print
+    # "TCH",  # flake8-type-checking
+    # "TD",  # flake8-todos
+    "TID", # flake8-tidy-imports
+    # "TRY",  # tryceratops
+    "UP",  # pyupgrade
+    "W",   # pycodestyle
+    "YTT", # flake8-2020
+]
+ignore = [
+    # Allow non-abstract empty methods in abstract base classes
+    "B027",
+    # Allow boolean positional values in function calls, like `dict.get(... True)`
+    "FBT003",
+    # Ignore checks for possible passwords
+    "S105",
+    "S106",
+    "S107",
+    # Ignore complexity
+    "C901",
+    "PLR0911",
+    "PLR0912",
+    "PLR0913",
+    "PLR0915",
+]
+unfixable = [
+    # Don't touch unused imports
+    "F401",
+]
+
+[tool.ruff.isort]
+known-first-party = ["kolibri2zim"]
+
+[tool.ruff.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.per-file-ignores]
+# Tests can use magic values, assertions, and relative imports
+"tests/**/*" = ["PLR2004", "S101", "TID252"]
+
+[tool.pyright]
+pythonVersion = "3.11"
+pythonPlatform = "All"
+typeCheckingMode = "basic"
+
+include = ["kolibri2zim"]
+exclude = ["**/node_modules",
+    "**/__pycache__",
+    "kolibri2zim/templates",
+]
diff --git a/setup.py b/setup.py
deleted file mode 100644
index c6b1eab..0000000
--- a/setup.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# vim: ai ts=4 sts=4 et sw=4 nu
-
-import pathlib
-import subprocess
-from setuptools import setup
-
-root_dir = pathlib.Path(__file__).parent
-
-
-def read(*names, **kwargs):
-    with open(root_dir.joinpath(*names), "r") as fh:
-        return fh.read()
-
-
-print("Downloading and fixing JS dependencies...")
-subprocess.run([str(root_dir.joinpath("get_js_deps.sh").resolve())], check=True)
-
-
-setup(
-    name="kolibri2zim",
-    version=read("kolibri2zim", "VERSION").strip(),
-    description="Make ZIM file from Kolibri Channels",
-    long_description=read("README.md"),
-    long_description_content_type="text/markdown",
-    author="satyamtg",
-    author_email="io.satyamtg@gmail.com",
-    url="https://github.com/openzim/kolibri2zim",
-    keywords="kiwix zim offline kolibri",
-    license="GPLv3+",
-    packages=["kolibri2zim"],
-    install_requires=[
-        line.strip()
-        for line in read("requirements.txt").splitlines()
-        if not line.strip().startswith("#")
-    ],
-    zip_safe=False,
-    include_package_data=True,
-    entry_points={
-        "console_scripts": [
-            "kolibri2zim=kolibri2zim.__main__:main",
-        ]
-    },
-    classifiers=[
-        "Development Status :: 4 - Beta",
-        "Intended Audience :: Developers",
-        "Programming Language :: Python",
-        "Programming Language :: Python :: 3.8",
-        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
-    ],
-    python_requires=">=3.6",
-)
diff --git a/tasks.py b/tasks.py
new file mode 100644
index 0000000..424223b
--- /dev/null
+++ b/tasks.py
@@ -0,0 +1,76 @@
+# pyright: strict, reportUntypedFunctionDecorator=false
+import os
+
+from invoke.context import Context
+from invoke.tasks import task  # pyright: ignore [reportUnknownVariableType]
+
+use_pty = not os.getenv("CI", "")
+
+
+@task(
+    optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"}
+)
+def lint_black(ctx: Context, args: str | None = ""):
+    args = args or "."
+    ctx.run("black --version", pty=use_pty)
+    ctx.run(f"black --check --diff {args}", pty=use_pty)
+
+
+@task(
+    optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"}
+)
+def lint_ruff(ctx: Context, args: str | None = ""):
+    args = args or "."
+    ctx.run("ruff --version", pty=use_pty)
+    ctx.run(f"ruff check {args}", pty=use_pty)
+
+
+@task(
+    optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"}
+)
+def lintall(ctx: Context, args: str | None = ""):
+    """check linting"""
+    args = args or "."
+    lint_black(ctx, args)
+    lint_ruff(ctx, args)
+
+
+@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
+def check_pyright(ctx: Context, args: str | None = ""):
+    """check static types with pyright"""
+    args = args or ""  # fall back to no extra arguments
+    ctx.run("pyright --version", pty=use_pty)  # same pty setting as the other tasks
+    ctx.run(f"pyright {args}", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
+def checkall(ctx: Context, args: str | None = ""):
+    """check static types"""
+    args = args or ""
+    check_pyright(ctx, args)
+
+
+@task(optional=["args"], help={"args": "black additional arguments"})
+def fix_black(ctx: Context, args: str | None = ""):
+    """fix black formatting"""
+    args = args or "."
+    ctx.run(f"black {args}", pty=use_pty)  # type: ignore
+
+
+@task(optional=["args"], help={"args": "ruff additional arguments"})
+def fix_ruff(ctx: Context, args: str | None = ""):
+    """fix all ruff rules (explicit `check` subcommand, as in lint_ruff)"""
+    args = args or "."  # default to the current directory
+    ctx.run(f"ruff check --fix {args}", pty=use_pty)  # type: ignore
+
+
+@task(
+    optional=["args"],
+    help={"args": "linting (fix mode) tools (black, ruff) additional arguments"},
+)
+def fixall(ctx: Context, args: str | None = ""):
+    """fix everything automatically"""
+    args = args or "."
+    fix_black(ctx, args)
+    fix_ruff(ctx, args)
+    lintall(ctx, args)

From 222727d1b67976dada9fb67beee626cec87e3eca Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:00:44 +0200
Subject: [PATCH 05/45] Add pre-commit configuration

---
 .pre-commit-config.yaml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..577ac69
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+  -   id: trailing-whitespace
+  -   id: end-of-file-fixer
+- repo: https://github.com/psf/black
+  rev: "23.3.0"
+  hooks:
+  -   id: black
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.0.272
+  hooks:
+  - id: ruff
+- repo: https://github.com/RobertCraigie/pyright-python
+  rev: v1.1.315
+  hooks:
+  - id: pyright
+    name: pyright (system)
+    description: 'pyright static type checker'
+    entry: pyright
+    language: system
+    'types_or': [python, pyi]
+    require_serial: true
+    minimum_pre_commit_version: '2.9.2'

From a2fabb1dc132e44a45494db2604869f410c1dd34 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:07:51 +0200
Subject: [PATCH 06/45] Fix simple issues raised by ruff

---
 dump_channel_to_fs.py     |  9 +++----
 kolibri2zim/__main__.py   |  3 +--
 kolibri2zim/constants.py  |  5 ++--
 kolibri2zim/database.py   |  3 +--
 kolibri2zim/debug.py      |  7 +++---
 kolibri2zim/entrypoint.py |  7 +++---
 kolibri2zim/processing.py |  4 +--
 kolibri2zim/scraper.py    | 51 ++++++++++++++++++++++-----------------
 8 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py
index ad066f7..298a9f0 100755
--- a/dump_channel_to_fs.py
+++ b/dump_channel_to_fs.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
 
@@ -18,14 +17,14 @@
 
     Uses wget for downloads """
 
+import contextlib
+import logging
+import multiprocessing as mp
 import os
-import sys
 import pathlib
-import logging
 import sqlite3
-import contextlib
 import subprocess
-import multiprocessing as mp
+import sys
 
 STUDIO_DEFAULT_BASE_URL = "https://studio.learningequality.org"
 STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL)
diff --git a/kolibri2zim/__main__.py b/kolibri2zim/__main__.py
index 03b42a7..5615eb7 100644
--- a/kolibri2zim/__main__.py
+++ b/kolibri2zim/__main__.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-import sys
 import pathlib
+import sys
 
 
 def main():
diff --git a/kolibri2zim/constants.py b/kolibri2zim/constants.py
index 578c8fc..3495c2a 100644
--- a/kolibri2zim/constants.py
+++ b/kolibri2zim/constants.py
@@ -1,11 +1,10 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-import os
-import pathlib
 import logging
 import multiprocessing
+import os
+import pathlib
 
 from zimscraperlib.logging import getLogger as lib_getLogger
 
diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py
index 3ce6b3c..22f2d70 100644
--- a/kolibri2zim/database.py
+++ b/kolibri2zim/database.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-import pathlib
 import logging
+import pathlib
 import sqlite3
 
 logger = logging.getLogger(__name__)
diff --git a/kolibri2zim/debug.py b/kolibri2zim/debug.py
index b6fe2cd..66832e5 100644
--- a/kolibri2zim/debug.py
+++ b/kolibri2zim/debug.py
@@ -1,11 +1,10 @@
 import io
 import logging
 import pathlib
-from typing import Optional, Tuple
 
 import requests
 from retrying import retry
-from zimscraperlib.download import stream_file, _get_retry_adapter
+from zimscraperlib.download import _get_retry_adapter, stream_file
 from zimscraperlib.video.encoding import reencode
 
 logging.basicConfig(level=logging.DEBUG)
@@ -41,8 +40,8 @@ def get_size_and_mime(url: str) -> Tuple[int, str]:
 @retry(stop_max_attempt_number=5, wait_exponential_multiplier=20000)
 def download_to(
     url: str,
-    fpath: Optional[pathlib.Path] = None,
-    byte_stream: Optional[io.IOBase] = None,
+    fpath: pathlib.Path | None = None,
+    byte_stream: io.BytesIO | None = None,
 ):
     logger.debug(f"download_to({url=}) {'to-file' if fpath else 'to-mem'}")
     stream_file(url, fpath=fpath, byte_stream=byte_stream)
diff --git a/kolibri2zim/entrypoint.py b/kolibri2zim/entrypoint.py
index 93f9f28..fc2f0f3 100755
--- a/kolibri2zim/entrypoint.py
+++ b/kolibri2zim/entrypoint.py
@@ -1,11 +1,10 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-import sys
 import argparse
+import sys
 
-from .constants import NAME, SCRAPER, Global, getLogger, setDebug
+from kolibri2zim.constants import NAME, SCRAPER, Global, get_logger, set_debug
 
 
 def main():
@@ -193,7 +192,7 @@ def main():
     setDebug(args.debug)
     logger = getLogger()
 
-    from .scraper import Kolibri2Zim
+    from kolibri2zim.scraper import Kolibri2Zim
 
     try:
         scraper = Kolibri2Zim(**dict(args._get_kwargs()))
diff --git a/kolibri2zim/processing.py b/kolibri2zim/processing.py
index 6de0f4f..635f9aa 100644
--- a/kolibri2zim/processing.py
+++ b/kolibri2zim/processing.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
 from zimscraperlib.video.encoding import reencode
@@ -27,7 +26,8 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality):
         raise FileNotFoundError(f"Missing video file in {video_dir}")
     if len(files) > 1:
         logger.warning(
-            f"Multiple video file candidates for {video_id} in {video_dir}. Picking {files[0]} out of {files}"
+            f"Multiple video file candidates for {video_id} in {video_dir}. "
+            f"Picking {files[0]} out of {files}"
         )
     src_path = files[0]
 
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 28d59c1..5fa208c 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -1,36 +1,40 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-import io
-import shutil
 import base64
-import zipfile
+import concurrent.futures as cf
 import datetime
-import tempfile
-import threading
 import hashlib
+import io
 import json
+import shutil
+import tempfile
+import threading
+import zipfile
 from pathlib import Path
-from typing import Optional
-import concurrent.futures as cf
+from typing import Any
 
 import jinja2
 from bs4 import BeautifulSoup
-from pif import get_public_ip
 from kiwixstorage import KiwixStorage
-from zimscraperlib.zim.creator import Creator
-from zimscraperlib.zim.items import StaticItem
+from pif import get_public_ip
+from zimscraperlib.filesystem import get_file_mimetype
 from zimscraperlib.i18n import find_language_names
-from zimscraperlib.inputs import handle_user_provided_file
 from zimscraperlib.image.convertion import convert_image, create_favicon
 from zimscraperlib.image.transformation import resize_image
-from zimscraperlib.filesystem import get_file_mimetype
-from zimscraperlib.video.presets import VideoWebmLow, VideoWebmHigh, VideoMp4Low
+from zimscraperlib.inputs import handle_user_provided_file
+from zimscraperlib.video.presets import VideoMp4Low, VideoWebmHigh, VideoWebmLow
+from zimscraperlib.zim.creator import Creator
+from zimscraperlib.zim.items import StaticItem
 
-from .constants import ROOT_DIR, getLogger, STUDIO_URL
-from .database import KolibriDB
-from .debug import ON_DISK_THRESHOLD, download_to, get_size_and_mime, safer_reencode
+from kolibri2zim.constants import ROOT_DIR, STUDIO_URL, get_logger
+from kolibri2zim.database import KolibriDB
+from kolibri2zim.debug import (
+    ON_DISK_THRESHOLD,
+    download_to,
+    get_size_and_mime,
+    safer_reencode,
+)
 
 logger = getLogger()
 options = [
@@ -58,7 +62,7 @@
     "about",
     "css",
     "dedup_html_files",
-    "node_ids"
+    "node_ids",
 ]
 NOSTREAM_FUNNEL_SIZE = 1024  # 2**20 * 2  # 2MiB
 
@@ -198,7 +202,12 @@ def funnel_file(self, fid, fext):
         url, fname = get_kolibri_url_for(fid, fext)
         size, mimetype = get_size_and_mime(url)
 
-        item_kw = dict(path=fname, title="", mimetype=mimetype, delete_fpath=True)
+        item_kw = {
+            "path": fname,
+            "title": "",
+            "mimetype": mimetype,
+            "delete_fpath": True,
+        }
 
         if not size or size >= ON_DISK_THRESHOLD:
             item_kw["fpath"] = Path(
@@ -347,7 +356,6 @@ def add_video_node(self, node_id):
 
             # funnel from S3 cache if it is present there
             if not self.funnel_from_s3(vfid, path, vchk, preset):
-
                 # download original video
                 src = self.download_to_disk(vid, video_file["ext"])
                 dst = src.with_suffix(".webm")
@@ -365,7 +373,6 @@ def add_video_node(self, node_id):
 
             # funnel from S3 cache if it is present there
             if not self.funnel_from_s3(vfid, path, vchk, preset):
-
                 # download original video
                 src = self.download_to_disk(vid, video_file["ext"])
 
@@ -905,7 +912,7 @@ def sanitize_inputs(self):
             self.publisher = "Openzim"
         self.publisher = self.publisher.strip()
 
-        self.tags = list(set(self.tags + ["_category:other", "kolibri", "_videos:yes"]))
+        self.tags = list({*self.tags, "_category:other", "kolibri", "_videos:yes"})
 
     def retrieve_favicon(self):
         favicon_orig = self.build_dir / "favicon"

From bc5785d03e79bb2633b774dc94eb43a6ced441b4 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:10:28 +0200
Subject: [PATCH 07/45] Fix subtle issues raised by pyright / ruff

---
 dump_channel_to_fs.py     |  39 +++++--
 kolibri2zim/__main__.py   |   2 +-
 kolibri2zim/constants.py  |   9 +-
 kolibri2zim/database.py   |  44 ++++---
 kolibri2zim/debug.py      |   6 +-
 kolibri2zim/entrypoint.py |   4 +-
 kolibri2zim/processing.py |   8 +-
 kolibri2zim/scraper.py    | 233 ++++++++++++++++++++++++--------------
 8 files changed, 216 insertions(+), 129 deletions(-)

diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py
index 298a9f0..87ab761 100755
--- a/dump_channel_to_fs.py
+++ b/dump_channel_to_fs.py
@@ -35,7 +35,7 @@
 logger = logging.getLogger("dump-remote")
 
 
-def download_if_missing(url, fpath, fsize=None, force=False):
+def download_if_missing(url, fpath, fsize=None, *, force=False):
     skipped = (
         fpath.exists()
         and (fsize is not None and os.path.getsize(fpath) == fsize)
@@ -44,7 +44,7 @@ def download_if_missing(url, fpath, fsize=None, force=False):
     if not skipped:
         fpath.unlink(missing_ok=True)
         wget = subprocess.run(
-            [
+            [  # noqa: S603
                 "/usr/bin/env",
                 "wget",
                 "-t",
@@ -62,7 +62,8 @@ def download_if_missing(url, fpath, fsize=None, force=False):
         )
         if wget.returncode != 0:
             logger.error(wget.stdout)
-            raise Exception(f"wget exited with retcode {wget.returncode}")
+            msg = f"wget exited with retcode {wget.returncode}"
+            raise Exception(msg)
     return not skipped, url, fpath
 
 
@@ -89,12 +90,11 @@ def get_rows(db_path, query):
         cursor = conn.execute(query)
         rows = cursor.fetchmany()
         while rows:
-            for row in rows:
-                yield row
+            yield from rows
             rows = cursor.fetchmany()
 
 
-def dump(channel_id, build_dir=None, force=False):
+def dump(channel_id: str, build_dir: str | None, *, force: bool):
     build_path = pathlib.Path(build_dir or "build")
     logger.info(f"dumping {channel_id} into {build_path}")
     build_path.mkdir(exist_ok=True, parents=True)
@@ -112,7 +112,7 @@ def dump(channel_id, build_dir=None, force=False):
     nb_files = get_single_value(db_path, "SELECT COUNT(*) FROM content_file")
     logger.info(f"Looping over all {nb_files} files")
 
-    def on_error(*args, **kwargs):
+    def on_error(*args, **kwargs):  # noqa: ARG001
         logger.error("Failed to download something")
 
     def on_success(result):
@@ -145,8 +145,27 @@ def on_success(result):
     logger.info("Done downloading files")
 
 
+CHANNEL_ID_POS_IN_ARGV = 2
+BUILD_DIR_POS_IN_ARGV = 3
+FORCE_POS_IN_ARGV = 4
+
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Missing channel ID")
+    if len(sys.argv) < CHANNEL_ID_POS_IN_ARGV:
+        logger.error("Missing channel ID")
         sys.exit(1)
-    dump(*sys.argv[1:])
+
+    channel_id = sys.argv[1]
+    if len(sys.argv) >= BUILD_DIR_POS_IN_ARGV:
+        build_dir = sys.argv[2]
+    else:
+        build_dir = None
+    if len(sys.argv) >= FORCE_POS_IN_ARGV:
+        force = (
+            sys.argv[3].lower() == "true"
+            or sys.argv[3].lower() == "force"
+            or sys.argv[3].lower() == "yes"
+        )
+    else:
+        force = False
+
+    dump(channel_id=channel_id, build_dir=build_dir, force=force)
diff --git a/kolibri2zim/__main__.py b/kolibri2zim/__main__.py
index 5615eb7..c595cdc 100644
--- a/kolibri2zim/__main__.py
+++ b/kolibri2zim/__main__.py
@@ -7,7 +7,7 @@
 
 def main():
     # allows running it from source using python kolibri2zim
-    sys.path = [str(pathlib.Path(__file__).parent.parent.resolve())] + sys.path
+    sys.path = [str(pathlib.Path(__file__).parent.parent.resolve()), *sys.path]
 
     from kolibri2zim.entrypoint import main as entry
 
diff --git a/kolibri2zim/constants.py b/kolibri2zim/constants.py
index 3495c2a..6d7a7bb 100644
--- a/kolibri2zim/constants.py
+++ b/kolibri2zim/constants.py
@@ -11,7 +11,7 @@
 ROOT_DIR = pathlib.Path(__file__).parent
 NAME = ROOT_DIR.name
 
-with open(ROOT_DIR.joinpath("VERSION"), "r") as fh:
+with open(ROOT_DIR.joinpath("VERSION")) as fh:
     VERSION = fh.read().strip()
 
 SCRAPER = f"{NAME} {VERSION}"
@@ -25,7 +25,7 @@ def is_running_inside_container():
     if not fpath.exists():
         return False
     try:
-        with open(fpath, "r") as fh:
+        with open(fpath) as fh:
             for line in fh.readlines():
                 if line.strip().rsplit(":", 1)[-1] != "/":
                     return True
@@ -37,6 +37,7 @@ def is_running_inside_container():
 class Global:
     debug = False
     inside_container = is_running_inside_container()
+    nb_available_cpus: int
 
 
 Global.nb_available_cpus = (
@@ -44,11 +45,11 @@ class Global:
 )
 
 
-def setDebug(debug):
+def set_debug(debug):
     """toggle constants global DEBUG flag (used by getLogger)"""
     Global.debug = bool(debug)
 
 
-def getLogger():
+def get_logger():
     """configured logger respecting DEBUG flag"""
     return lib_getLogger(NAME, level=logging.DEBUG if Global.debug else logging.INFO)
diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py
index 22f2d70..5c0f0d2 100644
--- a/kolibri2zim/database.py
+++ b/kolibri2zim/database.py
@@ -22,7 +22,7 @@ class KolibriDB:
     Kolibri uses the Modified Preorder Tree Traversal model, from django-mptt
     https://gist.github.com/tmilos/f2f999b5839e2d42d751"""
 
-    def __init__(self, fpath: pathlib.Path, root_id: str = None):
+    def __init__(self, fpath: pathlib.Path, root_id: str | None = None):
         self.conn = sqlite3.connect(
             f"file:{fpath.expanduser().resolve()}?mode=ro",
             uri=True,
@@ -36,7 +36,8 @@ def __init__(self, fpath: pathlib.Path, root_id: str = None):
 
         self.root = self.get_node(root_id)
         if not self.root:
-            raise ValueError(f"No node for root-id {root_id}")
+            msg = f"No node for root-id {root_id}"
+            raise ValueError(msg)
 
     @property
     def fpath(self):
@@ -73,8 +74,7 @@ def get_rows(self, query, *args, **kwargs):
             cursor = conn.execute(query, *args, **kwargs)
             rows = cursor.fetchmany()
             while rows:
-                for row in rows:
-                    yield row
+                yield from rows
                 rows = cursor.fetchmany()
 
     def get_channel_metadata(self, channel_id):
@@ -94,8 +94,7 @@ def get_node_descendants(self, node_id, left=None, right=None):
             "ORDER BY level ASC",
             (left, right),
         ):
-            row = dict(row)
-            yield row
+            yield dict(row)
 
     def get_node_children(self, node_id, left=None, right=None):
         if left is None or right is None:
@@ -110,17 +109,17 @@ def get_node_children(self, node_id, left=None, right=None):
             "ORDER BY level ASC",
             (left, right, node_id),
         ):
-            row = dict(row)
-            row.update(
+            rowdict = dict(row)
+            rowdict.update(
                 {
-                    "thumbnail": self.get_thumbnail_name(row["id"]),
+                    "thumbnail": self.get_thumbnail_name(rowdict["id"]),
                 }
             )
-            yield row
+            yield rowdict
 
     def get_node_children_count(self, node_id, left=None, right=None):
         if left is None or right is None:
-            node = self.get_node(with_parents=False, with_children=False)
+            node = self.get_node(node_id, with_parents=False, with_children=False)
             left = node["left"]
             right = node["right"]
 
@@ -132,7 +131,7 @@ def get_node_children_count(self, node_id, left=None, right=None):
 
     def get_node_parents(self, node_id, left=None, right=None):
         if left is None or right is None:
-            node = self.get_node(with_parents=False, with_children=False)
+            node = self.get_node(node_id, with_parents=False, with_children=False)
             left = node["left"]
             right = node["right"]
 
@@ -147,7 +146,7 @@ def get_node_parents(self, node_id, left=None, right=None):
 
     def get_node_parents_count(self, node_id, left=None, right=None):
         if left is None or right is None:
-            node = self.get_node(with_parents=False, with_children=False)
+            node = self.get_node(node_id, with_parents=False, with_children=False)
             left = node["left"]
             right = node["right"]
 
@@ -159,7 +158,7 @@ def get_node_parents_count(self, node_id, left=None, right=None):
             (left, right, self.root_left, self.root_right),
         )
 
-    def get_node(self, node_id, with_parents=False, with_children=False):
+    def get_node(self, node_id, *, with_parents=False, with_children=False):
         node = self.get_row(
             "SELECT id, title, description, author, level, kind, "
             "license_name as license, license_owner, "
@@ -195,13 +194,13 @@ def get_node(self, node_id, with_parents=False, with_children=False):
             )
         return node
 
-    def get_node_file(self, node_id, thumbnail=False):
+    def get_node_file(self, node_id, *, thumbnail=False):
         try:
-            return next(self.get_node_files(node_id, thumbnail))
+            return next(self.get_node_files(node_id, thumbnail=thumbnail))
         except StopIteration:
             return None
 
-    def get_node_files(self, node_id, thumbnail=False):
+    def get_node_files(self, node_id, *, thumbnail=False):
         for row in self.get_rows(
             "SELECT id as fid, local_file_id as id, "
             "extension as ext, priority as prio, "
@@ -210,7 +209,16 @@ def get_node_files(self, node_id, thumbnail=False):
             "ORDER BY priority ASC",
             (node_id, 1, 1 if thumbnail else 0),
         ):
-            yield dict(row)
+            yield {
+                "id": row["id"],
+                "fid": row["fid"],
+                "ext": row["ext"],
+                "prio": row["prio"],
+                "supp": row["supp"],
+                "checksum": row["checksum"],
+                "lang": row["lang"],
+                "preset": row["preset"],
+            }
 
     def get_node_thumbnail(self, node_id):
         return self.get_node_file(node_id, thumbnail=True)
diff --git a/kolibri2zim/debug.py b/kolibri2zim/debug.py
index 66832e5..0077113 100644
--- a/kolibri2zim/debug.py
+++ b/kolibri2zim/debug.py
@@ -18,9 +18,11 @@
 
 # retry up to 3 times, with delay from 40s
 @retry(stop_max_attempt_number=3, wait_exponential_multiplier=20000)
-def get_size_and_mime(url: str) -> Tuple[int, str]:
+def get_size_and_mime(url: str) -> tuple[int | None, str]:
     logger.debug(f"get_size_and_mime({url=})")
-    _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True)
+    _, headers = stream_file(
+        url, byte_stream=io.BytesIO(), only_first_block=True
+    )  # type: ignore # see https://github.com/openzim/python-scraperlib/issues/104
     mimetype = headers.get("Content-Type", "application/octet-stream")
     # Encoded data (compressed) prevents us from using Content-Length header
     # as source for the content (it represents length of compressed data)
diff --git a/kolibri2zim/entrypoint.py b/kolibri2zim/entrypoint.py
index fc2f0f3..98576e8 100755
--- a/kolibri2zim/entrypoint.py
+++ b/kolibri2zim/entrypoint.py
@@ -189,8 +189,8 @@ def main():
     )
 
     args = parser.parse_args()
-    setDebug(args.debug)
-    logger = getLogger()
+    set_debug(args.debug)
+    logger = get_logger()
 
     from kolibri2zim.scraper import Kolibri2Zim
 
diff --git a/kolibri2zim/processing.py b/kolibri2zim/processing.py
index 635f9aa..49a3088 100644
--- a/kolibri2zim/processing.py
+++ b/kolibri2zim/processing.py
@@ -3,10 +3,9 @@
 
 from zimscraperlib.video.encoding import reencode
 
-from .constants import getLogger
+from kolibri2zim.constants import get_logger
 
-
-logger = getLogger()
+logger = get_logger()
 
 
 def post_process_video(video_dir, video_id, preset, video_format, low_quality):
@@ -23,7 +22,8 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality):
     if len(files) == 0:
         logger.error(f"Video file missing in {video_dir} for {video_id}")
         logger.debug(list(video_dir.iterdir()))
-        raise FileNotFoundError(f"Missing video file in {video_dir}")
+        msg = f"Missing video file in {video_dir}"
+        raise FileNotFoundError(msg)
     if len(files) > 1:
         logger.warning(
             f"Multiple video file candidates for {video_id} in {video_dir}. "
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 5fa208c..ac9bbc0 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -36,7 +36,7 @@
     safer_reencode,
 )
 
-logger = getLogger()
+logger = get_logger()
 options = [
     "debug",
     "name",
@@ -79,20 +79,34 @@ def get_kolibri_url_for(file_id: str, ext: str):
     return f"{STUDIO_URL}/content/storage/{remote_path}", fname
 
 
-def read_from_zip(ark, member, as_text: Optional[bool] = True):
-    data = ark.open(member).read()
-    return data.decode("utf-8") if as_text else data
+def read_from_zip_as_bytes(ark, member):
+    return ark.open(member).read()
+
+
+def read_from_zip_as_text(ark, member):
+    return read_from_zip_as_bytes(ark, member).decode("utf-8")
 
 
 class Kolibri2Zim:
     def __init__(self, **kwargs):
-
         for option in options:
             if option not in kwargs:
-                raise ValueError(f"Missing parameter `{option}`")
+                msg = f"Missing parameter `{option}`"
+                raise ValueError(msg)
 
         def go(option):
-            return kwargs.get(option)
+            res = kwargs.get(option)
+            if type(res) is str:
+                return res
+            else:
+                return None
+
+        def gom(option):
+            res = go(option)
+            if not res:
+                msg = f"Unexpected kind of option for {option}"
+                raise Exception(msg)
+            return res
 
         self.channel_id = go("channel_id")
         self.root_id = go("root_id")
@@ -104,9 +118,11 @@ def go(option):
 
         # zim params
         self.fname = go("fname")
-        self.tags = (
-            [] if go("tags") is None else [t.strip() for t in go("tags").split(",")]
-        )
+        tags = go("tags")
+        if tags is None:
+            self.tags = []
+        else:
+            self.tags = [t.strip() for t in tags.split(",")]
         self.title = go("title")
         self.description = go("description")
         self.author = go("creator")
@@ -119,14 +135,14 @@ def go(option):
         self.css = go("css")
 
         # directory setup
-        self.output_dir = Path(go("output_dir")).expanduser().resolve()
+        self.output_dir = Path(str(go("output_dir"))).expanduser().resolve()
         if go("tmp_dir"):
-            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)
+            Path(str(go("tmp_dir"))).mkdir(parents=True, exist_ok=True)
         self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))
 
         # performances options
-        self.nb_threads = go("threads")
-        self.nb_processes = go("processes")
+        self.nb_threads = int(gom("threads"))
+        self.nb_processes = int(gom("processes"))
         self.s3_url_with_credentials = go("s3_url_with_credentials")
         self.s3_storage = None
         self.dedup_html_files = go("dedup_html_files")
@@ -136,8 +152,9 @@ def go(option):
         self.keep_build_dir = go("keep_build_dir")
         self.debug = go("debug")
         self.only_topics = go("only_topics")
+        node_ids = go("node_ids")
         self.node_ids = (
-            None if go("node_ids") is None else [t.strip() for t in go("node_ids").split(",")]
+            None if node_ids is None else [t.strip() for t in node_ids.split(",")]
         )
 
         # jinja2 environment setup
@@ -265,12 +282,10 @@ def funnel_from_s3(self, file_id, path, checksum, preset):
 
         # add to zim
         with self.creator_lock:
-            self.creator.add_item(
-                StaticItem(
-                    path=path,
-                    fileobj=fileobj,
-                    mimetype=preset.mimetype,
-                )
+            self.creator.add_item_for(
+                path=path,
+                content=fileobj.read(),
+                mimetype=preset.mimetype,
             )
         logger.debug(f"Added {path} from S3::{key}")
         return True
@@ -305,7 +320,10 @@ def add_topic_node(self, node_id):
         )
         with self.creator_lock:
             self.creator.add_item_for(
-                path=node_id, title=node["title"], content=html, mimetype="text/html"
+                path=node_id,
+                title=node["title"],
+                content=html.encode(),
+                mimetype="text/html",
             )
         logger.debug(f"Added topic #{node_id}")
 
@@ -317,24 +335,23 @@ def add_video_node(self, node_id):
         subtitle files (`video_subtitle`) are VTT files and are only limited by the
         number of language to select from in kolibri studio"""
 
-        files = self.db.get_node_files(node_id, thumbnail=False)
-        if not files:
+        files = list(self.db.get_node_files(node_id, thumbnail=False))
+        if len(files) == 0:
             return
         files = sorted(files, key=lambda f: f["prio"])
-        it = filter(lambda f: f["supp"] == 0, files)
+        it: list[dict[str, Any]] = list(filter(lambda f: f["supp"] == 0, files))
 
-        try:
-            # find main video file
-            video_file = next(it)
-        except StopIteration:
+        if len(it) == 0:
             # we have no video file
             return
-
-        try:
-            alt_video_file = next(it)
-        except StopIteration:
+        elif len(it) == 1:
             # we have no supplementary video file (which is OK)
+            video_file = it[0]
             alt_video_file = None
+        else:
+            # we have video and alt video
+            video_file = it[0]
+            alt_video_file = it[1]
 
         # now decide which file to keep and what to do with it
 
@@ -388,7 +405,9 @@ def add_video_node(self, node_id):
 
         # we want mp4, either in high-q or we have a low_res file to use
         else:
-            video_file = alt_video_file if self.low_quality else video_file
+            video_file = (
+                alt_video_file if self.low_quality and alt_video_file else video_file
+            )
             self.funnel_file(video_file["id"], video_file["ext"])
             video_filename = filename_for(video_file)
             video_filename_ext = video_file["ext"]
@@ -401,14 +420,14 @@ def add_video_node(self, node_id):
                 local, english = find_language_names(file["lang"])
             except Exception:
                 english = file["lang"]
-            finally:
-                subtitles.append(
-                    {
-                        "code": file["lang"],
-                        "name": english,
-                        "filename": filename_for(file),
-                    }
-                )
+
+            subtitles.append(
+                {
+                    "code": file["lang"],
+                    "name": english,
+                    "filename": filename_for(file),
+                }
+            )
 
         node = self.db.get_node(node_id, with_parents=True)
         html = self.jinja2_env.get_template("video.html").render(
@@ -424,7 +443,7 @@ def add_video_node(self, node_id):
             self.creator.add_item_for(
                 path=node_id,
                 title=node["title"],
-                content=html,
+                content=html.encode(),
                 mimetype="text/html",
             )
         logger.debug(f"Added video #{node_id}")
@@ -435,7 +454,10 @@ def add_video_upon_completion(self, future):
         logs error in case of failure"""
         if future.cancelled():
             return
-        src_fname, dst_fpath, path = self.videos_futures.get(future)
+        res = self.videos_futures.get(future)
+        if not res:
+            return
+        src_fname, dst_fpath, path = res
 
         try:
             future.result()
@@ -494,7 +516,10 @@ def request_s3_upload_and_removal(self, item):
         """add file from item to uploads list"""
         path = item.path
         del item
-        dest_fpath, key, meta = self.pending_upload.get(path)
+        res = self.pending_upload.get(path)
+        if not res:
+            return
+        dest_fpath, key, meta = res
         # TODO: submit to a thread executor (to create) instead
         # this is currently called on main-tread.
         self.upload_to_s3(key, dest_fpath, **meta)
@@ -521,7 +546,7 @@ def add_audio_node(self, node_id):
             self.creator.add_item_for(
                 path=node_id,
                 title=node["title"],
-                content=html,
+                content=html.encode(),
                 mimetype="text/html",
             )
         logger.debug(f"Added audio #{node_id}")
@@ -554,7 +579,7 @@ def add_exercise_node(self, node_id):
         if manifest_name not in zip_ark.namelist():
             logger.error(f"Excercise node without {manifest_name}")
             return
-        manifest = json.loads(read_from_zip(zip_ark, manifest_name))
+        manifest = json.loads(read_from_zip_as_bytes(zip_ark, manifest_name))
 
         # copy exercise content, rewriting internal paths
         # all internal resources to be stored under {node_id}/ prefix
@@ -562,14 +587,14 @@ def add_exercise_node(self, node_id):
         for assessment_item in manifest.get("all_assessment_items", []):
             item_path = f"{assessment_item}.json"
             if item_path in zip_ark.namelist():
-                perseus_content = read_from_zip(zip_ark, item_path)
+                perseus_content = read_from_zip_as_text(zip_ark, item_path)
                 perseus_content = perseus_content.replace(
                     r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}"
                 )
                 perseus_content = perseus_content.replace(
                     r"${☣ LOCALPATH}", f"./{node_id}"
                 )
-            assessment_items.append(perseus_content)
+                assessment_items.append(perseus_content)
 
         # add all support files to ZIM
         for ark_member in zip_ark.namelist():
@@ -581,12 +606,12 @@ def add_exercise_node(self, node_id):
                 self.creator.add_item_for(
                     path=path,
                     title="",
-                    content=read_from_zip(zip_ark, ark_member, as_text=False),
+                    content=read_from_zip_as_bytes(zip_ark, ark_member),
                 )
             logger.debug(f"Added exercise support file {path}")
 
         # prepare and add exercise HTML article
-        node = self.db.get_node(node_id, with_parents=True)
+        node = self.db.get_node(node_id, with_parents=True, with_children=False)
         html = self.jinja2_env.get_template("perseus_exercise.html").render(
             node_id=node_id,
             perseus_content=f"[{', '.join(assessment_items)}]",
@@ -595,7 +620,10 @@ def add_exercise_node(self, node_id):
         )
         with self.creator_lock:
             self.creator.add_item_for(
-                path=node_id, title=node["title"], content=html, mimetype="text/html"
+                path=node_id,
+                title=node["title"],
+                content=html.encode(),
+                mimetype="text/html",
             )
         logger.debug(f"Added exercise node #{node_id}")
 
@@ -671,7 +699,7 @@ def target_for(file):
                 self.creator.add_item_for(
                     path=path,
                     title=node["title"],
-                    content=html,
+                    content=html.encode(),
                     mimetype="text/html",
                 )
         logger.debug(f"Added document #{node_id}")
@@ -709,7 +737,7 @@ def add_html5_node(self, node_id):
 
             # calculate hash of file and add entry if not in zim already
             content = zip_ark.open(ark_member).read()
-            content_hash = hashlib.md5(content).hexdigest()  # nosec
+            content_hash = hashlib.md5(content).hexdigest()  # nosec # noqa: S324
 
             if content_hash not in self.html_files_cache:
                 self.html_files_cache.append(content_hash)
@@ -730,7 +758,8 @@ def add_html5_node(self, node_id):
 
     def run(self):
         if self.s3_url_with_credentials and not self.s3_credentials_ok():
-            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
+            msg = "Unable to connect to Optimization Cache. Check its URL."
+            raise ValueError(msg)
 
         s3_msg = (
             f"  using cache: {self.s3_storage.url.netloc} "
@@ -770,19 +799,34 @@ def run(self):
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
         self.creator_lock = threading.Lock()
+        if not self.root_id:
+            logger.error("Missing root id")
+            return 1
+        if not self.title:
+            logger.error("Missing title")
+            return 1
+        if not self.description:
+            logger.error("Missing description")
+            return 1
+        if not self.author:
+            logger.error("Missing author")
+            return 1
+        if not self.publisher:
+            logger.error("Missing publisher")
+            return 1
         self.creator = Creator(
-            filename=self.output_dir.joinpath(self.fname),
+            filename=self.output_dir.joinpath(self.clean_fname),
             main_path=self.root_id,
             ignore_duplicates=True,
         )
         self.creator.config_metadata(
-            Name=self.name,
+            Name=self.clean_fname,
             Language="eng",
             Title=self.title,
             Description=self.description,
             Creator=self.author,
             Publisher=self.publisher,
-            Date=datetime.date.today().strftime("%Y-%d-%m"),
+            Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),
             Illustration_48x48_at_1=self.favicon_fpath.read_bytes(),
         )
         self.creator.start()
@@ -830,8 +874,9 @@ def run(self):
                     f"FAILURE not_done={len(result.not_done)} done={len(result.done)}"
                 )
                 for future in result.done:
-                    if future.exception():
-                        raise future.exception()
+                    future_exception = future.exception()
+                    if future_exception:
+                        raise future_exception
         except KeyboardInterrupt:
             self.creator.can_finish = False
             logger.error("KeyboardInterrupt, exiting.")
@@ -887,18 +932,21 @@ def sanitize_inputs(self):
         channel_meta = self.db.get_channel_metadata(self.channel_id)
 
         # input  & metadata sanitation
-        period = datetime.datetime.now().strftime("%Y-%m")
+        period = datetime.datetime.now(datetime.UTC).strftime("%Y-%m")
         if self.fname:
             # make sure we were given a filename and not a path
-            self.fname = Path(self.fname.format(period=period))
-            if Path(self.fname.name) != self.fname:
-                raise ValueError(f"filename is not a filename: {self.fname}")
+            fname_path = Path(self.fname.format(period=period)).resolve()
+            if not fname_path.is_file():
+                msg = f"filename is not a filename: {self.fname}"
+                raise ValueError(msg)
+            self.clean_fname = fname_path.as_posix()
         else:
-            self.fname = f"{self.name}_{period}.zim"
+            self.clean_fname = f"{self.name}_{period}.zim"
 
         if not self.title:
-            self.title = channel_meta["name"]
-        self.title = self.title.strip()
+            self.clean_title = channel_meta["name"].strip()
+        else:
+            self.clean_title = self.title.strip()
 
         if not self.description:
             self.description = channel_meta["description"]
@@ -969,18 +1017,24 @@ def add_custom_about_and_css(self):
 
         if self.about:
             # if user provided a custom about page, use it
-            with open(
-                handle_user_provided_file(
-                    source=self.about, in_dir=self.build_dir, nocopy=True
-                ),
-                "r",
-            ) as fh:
-                soup = BeautifulSoup(fh.read(), "lxml")
-                title = soup.find("title").text
-                content = soup.select("body > .container")
-                # we're only interested in the first one
-                if isinstance(content, list):
-                    content = content[0]
+            user_provided_file = handle_user_provided_file(
+                source=self.about, in_dir=self.build_dir, nocopy=True
+            )
+            if not user_provided_file:
+                title = channel_meta["name"]
+                content = None
+            else:
+                with open(user_provided_file) as fh:
+                    soup = BeautifulSoup(fh.read(), "lxml")
+                    title = soup.find("title")
+                    if not title:
+                        msg = "Failed to extract title"
+                        raise Exception(msg)
+                    title = title.text
+                    content = soup.select("body > .container")
+                    # we're only interested in the first one
+                    if isinstance(content, list):
+                        content = content[0]
         else:
             title = channel_meta["name"]
             content = None
@@ -992,23 +1046,26 @@ def add_custom_about_and_css(self):
             self.creator.add_item_for(
                 path="about",
                 title=title,
-                content=html,
+                content=html.encode(),
                 mimetype="text/html",
             )
         del html
 
         # if user provided a custom CSS file, use it
         if self.css:
-            with open(
-                handle_user_provided_file(
-                    source=self.css, in_dir=self.build_dir, nocopy=True
-                ),
-                "r",
-            ) as fh:
-                content = fh.read()
+            user_provided_file = handle_user_provided_file(
+                source=self.css, in_dir=self.build_dir, nocopy=True
+            )
+            if not user_provided_file:
+                content = ""
+            else:
+                with open(user_provided_file) as fh:
+                    content = fh.read()
         # otherwise, create a blank one
         else:
             content = ""
 
-        self.creator.add_item_for("custom.css", content=content, mimetype="text/css")
+        self.creator.add_item_for(
+            "custom.css", content=content.encode(), mimetype="text/css"
+        )
         logger.debug("Added about page and custom CSS")

From e1686d0e6e670ca5c2bc3b340df14e62e304946a Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:16:19 +0200
Subject: [PATCH 08/45] Adapt CHANGELOG + fix typo

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6eb19cd..bfc2123 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,11 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-## Fixed
+### Fixed
 - Fix issue with ePub rendering which was outside the iframe
+- Many small fixes (including some bugs) detected by ruff / pyright
 
 ### Changed
 
+- Migrate to our new Python standard (hatch, ruff, pyright, ...)
 - Using zimscraperlib 3.1.0
 - Updated image to `python:3.11-bullseye`
 - Retry video reencoding up to three times

From 1777e57bb0f6d2fbdec583f2b322c97263c3dac5 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:30:31 +0200
Subject: [PATCH 09/45] Add QA workflow + publish on releases only, to PyPI in
 addition to Docker

---
 .github/workflows/docker.yml  | 29 -------------------
 .github/workflows/publish.yml | 52 +++++++++++++++++++++++++++++++++++
 .github/workflows/pull.yml    | 10 +++++++
 .github/workflows/push.yml    |  8 ++++++
 .github/workflows/qa.yml      | 34 +++++++++++++++++++++++
 5 files changed, 104 insertions(+), 29 deletions(-)
 delete mode 100644 .github/workflows/docker.yml
 create mode 100644 .github/workflows/publish.yml
 create mode 100644 .github/workflows/pull.yml
 create mode 100644 .github/workflows/push.yml
 create mode 100644 .github/workflows/qa.yml

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
deleted file mode 100644
index 2ca5fb4..0000000
--- a/.github/workflows/docker.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Docker
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - v*
-
-jobs:
-  build-and-push:
-    name: Deploy Docker Image
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v3.4.0
-      - name: Build and push
-        uses: openzim/docker-publish-action@v10
-        with:
-          image-name: openzim/kolibri
-          on-master: dev
-          tag-pattern: /^v([0-9.]+)$/
-          latest-on-tag: true
-          restrict-to: openzim/kolibri
-          registries: ghcr.io
-          credentials:
-            GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
-            GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
-          repo_description: auto
-          repo_overview: auto
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..fd9d884
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,52 @@
+name: Build and upload to PyPI
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  publish:
+    runs-on: ubuntu-22.04
+    permissions:
+      id-token: write # mandatory for PyPI trusted publishing
+
+    steps:
+      - uses: actions/checkout@v3.5.3
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4.6.1
+        with:
+          python-version: "3.11"
+          architecture: x64
+
+      - name: Build packages
+        run: |
+          pip install -U pip build
+          python -m build sdist wheel
+
+      - name: Upload to PyPI
+        uses: pypa/gh-action-pypi-publish@v1.8.6
+        # dont specify anything for Trusted Publishing
+        # https://docs.pypi.org/trusted-publishers
+        # with:
+        #  # Using token
+        #  user: __token__
+        #  password: ${{ secrets.PYPI_API_TOKEN }}
+        #
+        #  # Using token on test index
+        #  password: ${{ secrets.PYPI_TEST_API_TOKEN }}
+        #  repository_url: https://test.pypi.org/legacy/
+
+      - name: Build and push Docker image
+        uses: openzim/docker-publish-action@v10
+        with:
+          image-name: openzim/kolibri
+          on-master: dev
+          tag-pattern: /^v([0-9.]+)$/
+          latest-on-tag: true
+          restrict-to: openzim/kolibri
+          registries: ghcr.io
+          credentials: GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
+            GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
+          repo_description: auto
+          repo_overview: auto
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
new file mode 100644
index 0000000..be2b9de
--- /dev/null
+++ b/.github/workflows/pull.yml
@@ -0,0 +1,10 @@
+name: Pull Request
+
+on:
+  pull_request:
+
+jobs:
+  qa:
+    uses: ./.github/workflows/qa.yml
+    # run qa job if the pull request originates from a fork (otherwise the qa is already triggered by the push to a branch)
+    if: github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
new file mode 100644
index 0000000..d84f392
--- /dev/null
+++ b/.github/workflows/push.yml
@@ -0,0 +1,8 @@
+name: Push
+
+on:
+  push:
+
+jobs:
+  qa:
+    uses: ./.github/workflows/qa.yml
diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml
new file mode 100644
index 0000000..8927ab5
--- /dev/null
+++ b/.github/workflows/qa.yml
@@ -0,0 +1,34 @@
+name: QA
+
+on:
+  workflow_call:
+
+jobs:
+  check-qa:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v3.5.3
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4.6.1
+        with:
+          python-version: "3.11"
+          architecture: x64
+
+      - name: Install dependencies (and project)
+        run: |
+          pip install -U pip
+          pip install -e .[lint,scripts,test]
+
+      - name: Check black formatting
+        run: inv lint-black
+
+      - name: Check ruff
+        run: inv lint-ruff
+
+      # Installs and run pyright (node). Easier/faster alt. to: inv check-pyright
+      - name: Check with pyright
+        uses: jakebailey/pyright-action@v1.6.0
+        with:
+          version: 1.1.311

From 8a87e34bf3f24b9c78ab853d9428c035937b1982 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 18 Jul 2023 18:04:32 +0200
Subject: [PATCH 10/45] WIP

---
 .github/workflows/publish.yml | 15 +++------------
 .github/workflows/qa.yml      |  9 +++------
 dump_channel_to_fs.py         |  3 +--
 kolibri2zim/database.py       |  3 +--
 kolibri2zim/processing.py     |  3 +--
 kolibri2zim/scraper.py        | 15 +++++----------
 pyproject.toml                |  1 +
 7 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index fd9d884..ef88218 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -16,7 +16,7 @@ jobs:
       - name: Set up Python 3.11
         uses: actions/setup-python@v4.6.1
         with:
-          python-version: "3.11"
+          python-version-file: "pyproject.toml"
           architecture: x64
 
       - name: Build packages
@@ -26,16 +26,6 @@ jobs:
 
       - name: Upload to PyPI
         uses: pypa/gh-action-pypi-publish@v1.8.6
-        # dont specify anything for Trusted Publishing
-        # https://docs.pypi.org/trusted-publishers
-        # with:
-        #  # Using token
-        #  user: __token__
-        #  password: ${{ secrets.PYPI_API_TOKEN }}
-        #
-        #  # Using token on test index
-        #  password: ${{ secrets.PYPI_TEST_API_TOKEN }}
-        #  repository_url: https://test.pypi.org/legacy/
 
       - name: Build and push Docker image
         uses: openzim/docker-publish-action@v10
@@ -46,7 +36,8 @@ jobs:
           latest-on-tag: true
           restrict-to: openzim/kolibri
           registries: ghcr.io
-          credentials: GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
+          credentials:
+            GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
             GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
           repo_description: auto
           repo_overview: auto
diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml
index 8927ab5..abf892b 100644
--- a/.github/workflows/qa.yml
+++ b/.github/workflows/qa.yml
@@ -13,7 +13,7 @@ jobs:
       - name: Set up Python 3.11
         uses: actions/setup-python@v4.6.1
         with:
-          python-version: "3.11"
+          python-version-file: 'pyproject.toml'
           architecture: x64
 
       - name: Install dependencies (and project)
@@ -27,8 +27,5 @@ jobs:
       - name: Check ruff
         run: inv lint-ruff
 
-      # Installs and run pyright (node). Easier/faster alt. to: inv check-pyright
-      - name: Check with pyright
-        uses: jakebailey/pyright-action@v1.6.0
-        with:
-          version: 1.1.311
+      - name: Check pyright
+        run: inv check-pyright
diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py
index 87ab761..4ed042c 100755
--- a/dump_channel_to_fs.py
+++ b/dump_channel_to_fs.py
@@ -62,8 +62,7 @@ def download_if_missing(url, fpath, fsize=None, *, force=False):
         )
         if wget.returncode != 0:
             logger.error(wget.stdout)
-            msg = f"wget exited with retcode {wget.returncode}"
-            raise Exception(msg)
+            raise Exception(f"wget exited with retcode {wget.returncode}")
     return not skipped, url, fpath
 
 
diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py
index 5c0f0d2..99b5541 100644
--- a/kolibri2zim/database.py
+++ b/kolibri2zim/database.py
@@ -36,8 +36,7 @@ def __init__(self, fpath: pathlib.Path, root_id: str | None = None):
 
         self.root = self.get_node(root_id)
         if not self.root:
-            msg = f"No node for root-id {root_id}"
-            raise ValueError(msg)
+            raise ValueError(f"No node for root-id {root_id}")
 
     @property
     def fpath(self):
diff --git a/kolibri2zim/processing.py b/kolibri2zim/processing.py
index 49a3088..6132850 100644
--- a/kolibri2zim/processing.py
+++ b/kolibri2zim/processing.py
@@ -22,8 +22,7 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality):
     if len(files) == 0:
         logger.error(f"Video file missing in {video_dir} for {video_id}")
         logger.debug(list(video_dir.iterdir()))
-        msg = f"Missing video file in {video_dir}"
-        raise FileNotFoundError(msg)
+        raise FileNotFoundError(f"Missing video file in {video_dir}")
     if len(files) > 1:
         logger.warning(
             f"Multiple video file candidates for {video_id} in {video_dir}. "
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index ac9bbc0..bdb6a5d 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -91,8 +91,7 @@ class Kolibri2Zim:
     def __init__(self, **kwargs):
         for option in options:
             if option not in kwargs:
-                msg = f"Missing parameter `{option}`"
-                raise ValueError(msg)
+                raise ValueError(f"Missing parameter `{option}`")
 
         def go(option):
             res = kwargs.get(option)
@@ -104,8 +103,7 @@ def go(option):
         def gom(option):
             res = go(option)
             if not res:
-                msg = f"Unexpected kind of option for {option}"
-                raise Exception(msg)
+                raise Exception(f"Unexpected kind of option for {option}")
             return res
 
         self.channel_id = go("channel_id")
@@ -758,8 +756,7 @@ def add_html5_node(self, node_id):
 
     def run(self):
         if self.s3_url_with_credentials and not self.s3_credentials_ok():
-            msg = "Unable to connect to Optimization Cache. Check its URL."
-            raise ValueError(msg)
+            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
 
         s3_msg = (
             f"  using cache: {self.s3_storage.url.netloc} "
@@ -937,8 +934,7 @@ def sanitize_inputs(self):
             # make sure we were given a filename and not a path
             fname_path = Path(self.fname.format(period=period)).resolve()
             if not fname_path.is_file():
-                msg = f"filename is not a filename: {self.fname}"
-                raise ValueError(msg)
+                raise ValueError(f"filename is not a filename: {self.fname}")
             self.clean_fname = fname_path.as_posix()
         else:
             self.clean_fname = f"{self.name}_{period}.zim"
@@ -1028,8 +1024,7 @@ def add_custom_about_and_css(self):
                     soup = BeautifulSoup(fh.read(), "lxml")
                     title = soup.find("title")
                     if not title:
-                        msg = "Failed to extract title"
-                        raise Exception(msg)
+                        raise Exception("Failed to extract title")
                     title = title.text
                     content = soup.select("body > .container")
                     # we're only interested in the first one
diff --git a/pyproject.toml b/pyproject.toml
index 2f44f8a..6776f2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -143,6 +143,7 @@ select = [
 ignore = [
     # Allow non-abstract empty methods in abstract base classes
     "B027",
+    "EM",
     # Allow boolean positional values in function calls, like `dict.get(... True)`
     "FBT003",
     # Ignore checks for possible passwords

From b74cdaf9c1e3b1fcc0848a047d60dc1b3c974a6a Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Thu, 20 Jul 2023 17:23:23 +0200
Subject: [PATCH 11/45] Use str instead of as_posix

---
 hatch_build.py         | 2 +-
 kolibri2zim/scraper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hatch_build.py b/hatch_build.py
index 66b1f69..d98f9e3 100644
--- a/hatch_build.py
+++ b/hatch_build.py
@@ -10,7 +10,7 @@
 class GetJsDepsHook(BuildHookInterface):
     def initialize(self, version, build_data):
         subprocess.run(
-            Path(self.root).joinpath("get_js_deps.sh").as_posix(),  # noqa : S603
+            str(Path(self.root).joinpath("get_js_deps.sh")),  # noqa : S603
             check=True,
         )
         return super().initialize(version, build_data)
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index bdb6a5d..4c48923 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -935,7 +935,7 @@ def sanitize_inputs(self):
             fname_path = Path(self.fname.format(period=period)).resolve()
             if not fname_path.is_file():
                 raise ValueError(f"filename is not a filename: {self.fname}")
-            self.clean_fname = fname_path.as_posix()
+            self.clean_fname = str(fname_path)
         else:
             self.clean_fname = f"{self.name}_{period}.zim"
 

From ae2372c8c5a356ef0b1ed25c1e5c767fc3ab9338 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:32:35 +0200
Subject: [PATCH 12/45] Source version from appropriate new file

---
 kolibri2zim/constants.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kolibri2zim/constants.py b/kolibri2zim/constants.py
index 6d7a7bb..f8a27fb 100644
--- a/kolibri2zim/constants.py
+++ b/kolibri2zim/constants.py
@@ -8,11 +8,12 @@
 
 from zimscraperlib.logging import getLogger as lib_getLogger
 
+from kolibri2zim.__about__ import __version__
+
 ROOT_DIR = pathlib.Path(__file__).parent
 NAME = ROOT_DIR.name
 
-with open(ROOT_DIR.joinpath("VERSION")) as fh:
-    VERSION = fh.read().strip()
+VERSION = __version__
 
 SCRAPER = f"{NAME} {VERSION}"
 

From 05fb7a606d28ac123ba7f72e1a52aae9edc06c57 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:34:03 +0200
Subject: [PATCH 13/45] Fix Dockerfile for new build system

---
 Dockerfile     | 11 ++++++-----
 install.sh     |  5 +++++
 pyproject.toml |  4 ++++
 3 files changed, 15 insertions(+), 5 deletions(-)
 create mode 100755 install.sh

diff --git a/Dockerfile b/Dockerfile
index 274c715..89a7302 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,13 +5,14 @@ LABEL org.opencontainers.image.source https://github.com/openzim/kolibri2zim
 RUN apt-get update -y \
     && apt-get install -y --no-install-recommends locales-all unzip ffmpeg \
     && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/* \
+    && python -m pip install -U pip hatch
 
-COPY requirements.txt /src/
-RUN pip3 install --no-cache-dir -r /src/requirements.txt
+#COPY requirements.txt /src/
+#RUN pip3 install --no-cache-dir -r /src/requirements.txt
 COPY kolibri2zim /src/kolibri2zim
-COPY setup.py *.md get_js_deps.sh MANIFEST.in /src/
-RUN cd /src/ && ./get_js_deps.sh && python3 ./setup.py install
+COPY pyproject.toml *.md get_js_deps.sh install.sh MANIFEST.in LICENSE *.py /src/
+RUN cd /src/ && HATCH_BUILD_HOOKS_ENABLE=true hatch build -t sdist && ./install.sh
 
 # default output directory
 RUN mkdir -p /output
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..0b6a66b
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,5 @@
+search_dir=/src/dist/*.tar.gz
+for entry in $search_dir
+do
+  pip install "$entry"
+done
diff --git a/pyproject.toml b/pyproject.toml
index 6776f2d..ca1854c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "kolibri2zim[scripts]",
     "kolibri2zim[lint]",
     "kolibri2zim[check]",
+    "hatchling",
 ]
 
 [project.urls]
@@ -50,6 +51,9 @@ path = "kolibri2zim/__about__.py"
 exclude = ["/.github"]
 
 [tool.hatch.build.hooks.custom]
+enable-by-default = false
+path = "hatch_build.py"
+dependencies = ["zimscraperlib==3.1.0"]
 
 [tool.hatch.envs.default]
 features = ["dev"]

From 9d1daaaf9a94d6605bfbbd59094838468a41dd20 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:34:34 +0200
Subject: [PATCH 14/45] Remove obsolete file

---
 requirements.txt | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index fe0d483..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-## Direct dependencies
-zimscraperlib==3.1.0
-kiwixstorage==0.8.3
-Jinja2==3.1.2
-pif==0.8.2
-beautifulsoup4==4.9.3
-retrying==1.3.4

From 22c3b88ba3f4861d7ced040ead0e236abe6e615a Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:35:21 +0200
Subject: [PATCH 15/45] Upgrade Zimscraperlib to 3.1.1 + remove useless code

---
 kolibri2zim/scraper.py | 16 +++++++---------
 pyproject.toml         |  4 ++--
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 4c48923..6e52913 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -320,7 +320,7 @@ def add_topic_node(self, node_id):
             self.creator.add_item_for(
                 path=node_id,
                 title=node["title"],
-                content=html.encode(),
+                content=html,
                 mimetype="text/html",
             )
         logger.debug(f"Added topic #{node_id}")
@@ -441,7 +441,7 @@ def add_video_node(self, node_id):
             self.creator.add_item_for(
                 path=node_id,
                 title=node["title"],
-                content=html.encode(),
+                content=html,
                 mimetype="text/html",
             )
         logger.debug(f"Added video #{node_id}")
@@ -544,7 +544,7 @@ def add_audio_node(self, node_id):
             self.creator.add_item_for(
                 path=node_id,
                 title=node["title"],
-                content=html.encode(),
+                content=html,
                 mimetype="text/html",
             )
         logger.debug(f"Added audio #{node_id}")
@@ -620,7 +620,7 @@ def add_exercise_node(self, node_id):
             self.creator.add_item_for(
                 path=node_id,
                 title=node["title"],
-                content=html.encode(),
+                content=html,
                 mimetype="text/html",
             )
         logger.debug(f"Added exercise node #{node_id}")
@@ -697,7 +697,7 @@ def target_for(file):
                 self.creator.add_item_for(
                     path=path,
                     title=node["title"],
-                    content=html.encode(),
+                    content=html,
                     mimetype="text/html",
                 )
         logger.debug(f"Added document #{node_id}")
@@ -1041,7 +1041,7 @@ def add_custom_about_and_css(self):
             self.creator.add_item_for(
                 path="about",
                 title=title,
-                content=html.encode(),
+                content=html,
                 mimetype="text/html",
             )
         del html
@@ -1060,7 +1060,5 @@ def add_custom_about_and_css(self):
         else:
             content = ""
 
-        self.creator.add_item_for(
-            "custom.css", content=content.encode(), mimetype="text/css"
-        )
+        self.creator.add_item_for("custom.css", content=content, mimetype="text/css")
         logger.debug("Added about page and custom CSS")
diff --git a/pyproject.toml b/pyproject.toml
index ca1854c..ebb50c0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
     "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
 ]
 dependencies = [
-    "zimscraperlib==3.1.0",
+    "zimscraperlib==3.1.1",
     "kiwixstorage==0.8.3",
     "Jinja2==3.1.2",
     "pif==0.8.2",
@@ -53,7 +53,7 @@ exclude = ["/.github"]
 [tool.hatch.build.hooks.custom]
 enable-by-default = false
 path = "hatch_build.py"
-dependencies = ["zimscraperlib==3.1.0"]
+dependencies = ["zimscraperlib==3.1.1"]
 
 [tool.hatch.envs.default]
 features = ["dev"]

From fc7c08583950928ea8423cf8f565b6aaa27c79ca Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:36:02 +0200
Subject: [PATCH 16/45] Fix computation of nb_threads and nb_processes

---
 kolibri2zim/scraper.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 6e52913..181f60c 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -100,12 +100,6 @@ def go(option):
             else:
                 return None
 
-        def gom(option):
-            res = go(option)
-            if not res:
-                raise Exception(f"Unexpected kind of option for {option}")
-            return res
-
         self.channel_id = go("channel_id")
         self.root_id = go("root_id")
 
@@ -139,8 +133,10 @@ def gom(option):
         self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))
 
         # performances options
-        self.nb_threads = int(gom("threads"))
-        self.nb_processes = int(gom("processes"))
+        nb_threads_str = go("threads")
+        self.nb_threads = int(nb_threads_str) if nb_threads_str else None
+        nb_processes_str = go("processes")
+        self.nb_processes = int(nb_processes_str) if nb_processes_str else None
         self.s3_url_with_credentials = go("s3_url_with_credentials")
         self.s3_storage = None
         self.dedup_html_files = go("dedup_html_files")

From e35f35e04d633223d60a13e847adfb96f5baad47 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:36:36 +0200
Subject: [PATCH 17/45] Fail early if it looks like JS dependencies are not
 available

---
 kolibri2zim/scraper.py | 68 +++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 18 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 181f60c..4731f36 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -7,6 +7,7 @@
 import hashlib
 import io
 import json
+import os
 import shutil
 import tempfile
 import threading
@@ -751,24 +752,7 @@ def add_html5_node(self, node_id):
         logger.debug(f"Added HTML5 node #{node_id}")
 
     def run(self):
-        if self.s3_url_with_credentials and not self.s3_credentials_ok():
-            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
-
-        s3_msg = (
-            f"  using cache: {self.s3_storage.url.netloc} "
-            f"with bucket: {self.s3_storage.bucket_name}"
-            if self.s3_storage
-            else ""
-        )
-        logger.info(
-            f"Starting scraper with:\n"
-            f"  channel_id: {self.channel_id}\n"
-            f"  build_dir: {self.build_dir}\n"
-            f"  output_dir: {self.output_dir}\n"
-            f"  using webm : {self.use_webm}\n"
-            f"  low_quality : {self.low_quality}\n"
-            f"{s3_msg}"
-        )
+        self.ensure_js_deps_are_present()
 
         logger.info("Download database")
         self.download_db()
@@ -1058,3 +1042,51 @@ def add_custom_about_and_css(self):
 
         self.creator.add_item_for("custom.css", content=content, mimetype="text/css")
         logger.debug("Added about page and custom CSS")
+
+    def ensure_js_deps_are_present(self):
+        for js_deps_file in [
+            "epub.min.js",
+            "jszip.min.js",
+            "jquery.min.js",
+            "videojs-ogvjs.js",
+        ]:
+            if not os.path.exists(
+                self.templates_dir.joinpath(f"assets/{js_deps_file}")
+            ):
+                raise ValueError(
+                    "It looks like JS deps have not been installed,"
+                    f" {js_deps_file} is missing"
+                )
+
+        for js_deps_dir in [
+            "pdfjs",
+            "videojs",
+            "ogvjs",
+            "bootstrap",
+            "bootstrap-icons",
+            "perseus",
+        ]:
+            if not os.path.exists(self.templates_dir.joinpath(f"assets/{js_deps_dir}")):
+                raise ValueError(
+                    "It looks like JS deps have not been installed,"
+                    f" {js_deps_dir} is missing"
+                )
+
+        if self.s3_url_with_credentials and not self.s3_credentials_ok():
+            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
+
+        s3_msg = (
+            f"  using cache: {self.s3_storage.url.netloc} "
+            f"with bucket: {self.s3_storage.bucket_name}"
+            if self.s3_storage
+            else ""
+        )
+        logger.info(
+            f"Starting scraper with:\n"
+            f"  channel_id: {self.channel_id}\n"
+            f"  build_dir: {self.build_dir}\n"
+            f"  output_dir: {self.output_dir}\n"
+            f"  using webm : {self.use_webm}\n"
+            f"  low_quality : {self.low_quality}\n"
+            f"{s3_msg}"
+        )

From 20d6102201498bc2e0a95f177b32c257b21aaa2b Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:56:47 +0200
Subject: [PATCH 18/45] Revert useless modifications

---
 kolibri2zim/database.py | 11 +----------
 kolibri2zim/scraper.py  |  6 +-----
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py
index 99b5541..7ecfa6e 100644
--- a/kolibri2zim/database.py
+++ b/kolibri2zim/database.py
@@ -208,16 +208,7 @@ def get_node_files(self, node_id, *, thumbnail=False):
             "ORDER BY priority ASC",
             (node_id, 1, 1 if thumbnail else 0),
         ):
-            yield {
-                "id": row["id"],
-                "fid": row["fid"],
-                "ext": row["ext"],
-                "prio": row["prio"],
-                "supp": row["supp"],
-                "checksum": row["checksum"],
-                "lang": row["lang"],
-                "preset": row["preset"],
-            }
+            yield dict(row)
 
     def get_node_thumbnail(self, node_id):
         return self.get_node_file(node_id, thumbnail=True)
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 4731f36..bacddce 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -95,11 +95,7 @@ def __init__(self, **kwargs):
                 raise ValueError(f"Missing parameter `{option}`")
 
         def go(option):
-            res = kwargs.get(option)
-            if type(res) is str:
-                return res
-            else:
-                return None
+            return kwargs.get(option)
 
         self.channel_id = go("channel_id")
         self.root_id = go("root_id")

From 41ccf9388c8c65d336149a29e1cecc96baf44abb Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 11:06:54 +0200
Subject: [PATCH 19/45] Rewrite tmp_dir usage to avoid many calls to go()
 function

---
 dump_channel_to_fs.py  | 24 ++++------------
 kolibri2zim/scraper.py | 64 +++++++++++++++++++++---------------------
 2 files changed, 37 insertions(+), 51 deletions(-)

diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py
index 4ed042c..ad32cc0 100755
--- a/dump_channel_to_fs.py
+++ b/dump_channel_to_fs.py
@@ -144,27 +144,13 @@ def on_success(result):
     logger.info("Done downloading files")
 
 
-CHANNEL_ID_POS_IN_ARGV = 2
-BUILD_DIR_POS_IN_ARGV = 3
-FORCE_POS_IN_ARGV = 4
-
 if __name__ == "__main__":
-    if len(sys.argv) < CHANNEL_ID_POS_IN_ARGV:
+    args = [sys.argv[idx] if len(sys.argv) >= idx + 1 else None for idx in range(4)]
+    _, channel_id, build_dir, force = args
+
+    if not channel_id:
         logger.error("Missing channel ID")
         sys.exit(1)
-
-    channel_id = sys.argv[1]
-    if len(sys.argv) >= BUILD_DIR_POS_IN_ARGV:
-        build_dir = sys.argv[2]
-    else:
-        build_dir = None
-    if len(sys.argv) >= FORCE_POS_IN_ARGV:
-        force = (
-            sys.argv[3].lower() == "true"
-            or sys.argv[3].lower() == "force"
-            or sys.argv[3].lower() == "yes"
-        )
-    else:
-        force = False
+    force = bool(str(force).lower() in ("true", "force", "yes"))
 
     dump(channel_id=channel_id, build_dir=build_dir, force=force)
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index bacddce..b820982 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -13,7 +13,6 @@
 import threading
 import zipfile
 from pathlib import Path
-from typing import Any
 
 import jinja2
 from bs4 import BeautifulSoup
@@ -125,9 +124,10 @@ def go(option):
 
         # directory setup
         self.output_dir = Path(str(go("output_dir"))).expanduser().resolve()
-        if go("tmp_dir"):
-            Path(str(go("tmp_dir"))).mkdir(parents=True, exist_ok=True)
-        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))
+        tmp_dir = go("tmp_dir")
+        if tmp_dir:
+            Path(tmp_dir).mkdir(parents=True, exist_ok=True)
+        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))
 
         # performances options
         nb_threads_str = go("threads")
@@ -273,11 +273,12 @@ def funnel_from_s3(self, file_id, path, checksum, preset):
 
         # add to zim
         with self.creator_lock:
-            self.creator.add_item_for(
-                path=path,
-                content=fileobj.read(),
-                mimetype=preset.mimetype,
-            )
+            kwargs = {
+                "path": path,
+                "fileobj": fileobj,
+                "mimetype": preset.mimetype,
+            }
+            self.creator.add_item_for(StaticItem(**kwargs))
         logger.debug(f"Added {path} from S3::{key}")
         return True
 
@@ -326,23 +327,23 @@ def add_video_node(self, node_id):
         subtitle files (`video_subtitle`) are VTT files and are only limited by the
         number of language to select from in kolibri studio"""
 
-        files = list(self.db.get_node_files(node_id, thumbnail=False))
-        if len(files) == 0:
+        files = self.db.get_node_files(node_id, thumbnail=False)
+        if not files:
             return
         files = sorted(files, key=lambda f: f["prio"])
-        it: list[dict[str, Any]] = list(filter(lambda f: f["supp"] == 0, files))
-
-        if len(it) == 0:
+        it = filter(lambda f: f["supp"] == 0, files)
+        try:
+            # find main video file
+            video_file = next(it)
+        except StopIteration:
             # we have no video file
             return
-        elif len(it) == 1:
+
+        try:
+            alt_video_file = next(it)
+        except StopIteration:
             # we have no supplementary video file (which is OK)
-            video_file = it[0]
             alt_video_file = None
-        else:
-            # we have video and alt video
-            video_file = it[0]
-            alt_video_file = it[1]
 
         # now decide which file to keep and what to do with it
 
@@ -445,10 +446,10 @@ def add_video_upon_completion(self, future):
         logs error in case of failure"""
         if future.cancelled():
             return
-        res = self.videos_futures.get(future)
-        if not res:
+        try:
+            src_fname, dst_fpath, path = self.videos_futures[future]
+        except KeyError:
             return
-        src_fname, dst_fpath, path = res
 
         try:
             future.result()
@@ -507,10 +508,10 @@ def request_s3_upload_and_removal(self, item):
         """add file from item to uploads list"""
         path = item.path
         del item
-        res = self.pending_upload.get(path)
-        if not res:
+        try:
+            dest_fpath, key, meta = self.pending_upload[path]
+        except KeyError:
             return
-        dest_fpath, key, meta = res
         # TODO: submit to a thread executor (to create) instead
         # this is currently called on main-tread.
         self.upload_to_s3(key, dest_fpath, **meta)
@@ -908,17 +909,16 @@ def sanitize_inputs(self):
         period = datetime.datetime.now(datetime.UTC).strftime("%Y-%m")
         if self.fname:
             # make sure we were given a filename and not a path
-            fname_path = Path(self.fname.format(period=period)).resolve()
-            if not fname_path.is_file():
-                raise ValueError(f"filename is not a filename: {self.fname}")
+            fname_path = Path(str(self.fname).format(period=period))
+            if Path(fname_path.name) != fname_path:
+                raise ValueError(f"filename is not a filename: {fname_path}")
             self.clean_fname = str(fname_path)
         else:
             self.clean_fname = f"{self.name}_{period}.zim"
 
         if not self.title:
-            self.clean_title = channel_meta["name"].strip()
-        else:
-            self.clean_title = self.title.strip()
+            self.title = channel_meta["name"]
+        self.title = self.title.strip()
 
         if not self.description:
             self.description = channel_meta["description"]

From a233ee85a7fa50a6eb98e6237bc046f0224365d4 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:21:26 +0200
Subject: [PATCH 20/45] Only one read_from_zip makes more sense

---
 kolibri2zim/scraper.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index b820982..cfcec14 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -79,14 +79,10 @@ def get_kolibri_url_for(file_id: str, ext: str):
     return f"{STUDIO_URL}/content/storage/{remote_path}", fname
 
 
-def read_from_zip_as_bytes(ark, member):
+def read_from_zip(ark, member):
     return ark.open(member).read()
 
 
-def read_from_zip_as_text(ark, member):
-    return read_from_zip_as_bytes(ark, member).decode("utf-8")
-
-
 class Kolibri2Zim:
     def __init__(self, **kwargs):
         for option in options:
@@ -571,7 +567,7 @@ def add_exercise_node(self, node_id):
         if manifest_name not in zip_ark.namelist():
             logger.error(f"Excercise node without {manifest_name}")
             return
-        manifest = json.loads(read_from_zip_as_bytes(zip_ark, manifest_name))
+        manifest = json.loads(read_from_zip(zip_ark, manifest_name))
 
         # copy exercise content, rewriting internal paths
         # all internal resources to be stored under {node_id}/ prefix
@@ -579,7 +575,7 @@ def add_exercise_node(self, node_id):
         for assessment_item in manifest.get("all_assessment_items", []):
             item_path = f"{assessment_item}.json"
             if item_path in zip_ark.namelist():
-                perseus_content = read_from_zip_as_text(zip_ark, item_path)
+                perseus_content = read_from_zip(zip_ark, item_path).decode("utf-8")
                 perseus_content = perseus_content.replace(
                     r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}"
                 )
@@ -598,7 +594,7 @@ def add_exercise_node(self, node_id):
                 self.creator.add_item_for(
                     path=path,
                     title="",
-                    content=read_from_zip_as_bytes(zip_ark, ark_member),
+                    content=read_from_zip(zip_ark, ark_member),
                 )
             logger.debug(f"Added exercise support file {path}")
 

From 7d408ddcff6d527fb636b44f12b248e933a3ef17 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:21:46 +0200
Subject: [PATCH 21/45] Read directly from the files

---
 kolibri2zim/scraper.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index cfcec14..af2b073 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -992,16 +992,15 @@ def add_custom_about_and_css(self):
                 title = channel_meta["name"]
                 content = None
             else:
-                with open(user_provided_file) as fh:
-                    soup = BeautifulSoup(fh.read(), "lxml")
-                    title = soup.find("title")
-                    if not title:
-                        raise Exception("Failed to extract title")
-                    title = title.text
-                    content = soup.select("body > .container")
-                    # we're only interested in the first one
-                    if isinstance(content, list):
-                        content = content[0]
+                soup = BeautifulSoup(user_provided_file.read_bytes(), "lxml")
+                title = soup.find("title")
+                if not title:
+                    raise Exception("Failed to extract title")
+                title = title.text
+                content = soup.select("body > .container")
+                # we're only interested in the first one
+                if isinstance(content, list):
+                    content = content[0]
         else:
             title = channel_meta["name"]
             content = None
@@ -1026,8 +1025,7 @@ def add_custom_about_and_css(self):
             if not user_provided_file:
                 content = ""
             else:
-                with open(user_provided_file) as fh:
-                    content = fh.read()
+                content = user_provided_file.read_bytes()
         # otherwise, create a blank one
         else:
             content = ""

From fb341f70b36a8d1a77779930c937a3bfdfa4166c Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:30:10 +0200
Subject: [PATCH 22/45] Use major Github actions versions

---
 .github/workflows/publish.yml | 8 ++++----
 .github/workflows/qa.yml      | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ef88218..432ffc8 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,12 +11,12 @@ jobs:
       id-token: write # mandatory for PyPI trusted publishing
 
     steps:
-      - uses: actions/checkout@v3.5.3
+      - uses: actions/checkout@v3
 
       - name: Set up Python 3.11
-        uses: actions/setup-python@v4.6.1
+        uses: actions/setup-python@v4
         with:
-          python-version-file: "pyproject.toml"
+          python-version-file: pyproject.toml
           architecture: x64
 
       - name: Build packages
@@ -25,7 +25,7 @@ jobs:
           python -m build sdist wheel
 
       - name: Upload to PyPI
-        uses: pypa/gh-action-pypi-publish@v1.8.6
+        uses: pypa/gh-action-pypi-publish@release/v1.8
 
       - name: Build and push Docker image
         uses: openzim/docker-publish-action@v10
diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml
index abf892b..ec036c0 100644
--- a/.github/workflows/qa.yml
+++ b/.github/workflows/qa.yml
@@ -8,12 +8,12 @@ jobs:
     runs-on: ubuntu-22.04
 
     steps:
-      - uses: actions/checkout@v3.5.3
+      - uses: actions/checkout@v3
 
       - name: Set up Python 3.11
-        uses: actions/setup-python@v4.6.1
+        uses: actions/setup-python@v4
         with:
-          python-version-file: 'pyproject.toml'
+          python-version-file: pyproject.toml
           architecture: x64
 
       - name: Install dependencies (and project)

From dae158fce6dcc0155d99585edb398224bd5985ba Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:36:41 +0200
Subject: [PATCH 23/45] Add missing 'check' dependency to CI

---
 .github/workflows/qa.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml
index ec036c0..07978d3 100644
--- a/.github/workflows/qa.yml
+++ b/.github/workflows/qa.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Install dependencies (and project)
         run: |
           pip install -U pip
-          pip install -e .[lint,scripts,test]
+          pip install -e .[lint,check,scripts,test]
 
       - name: Check black formatting
         run: inv lint-black

From 7c83e231c67fe9d788887e10ae9ebffaaabd81a1 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:37:52 +0200
Subject: [PATCH 24/45] Fix misleading names in CI

---
 .github/workflows/publish.yml | 2 +-
 .github/workflows/qa.yml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 432ffc8..0f96987 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -13,7 +13,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - name: Set up Python 3.11
+      - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version-file: pyproject.toml
diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml
index 07978d3..54c93eb 100644
--- a/.github/workflows/qa.yml
+++ b/.github/workflows/qa.yml
@@ -10,7 +10,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - name: Set up Python 3.11
+      - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version-file: pyproject.toml

From 6552dba1e5bc0ac92b34403ba1abab4ee13f7737 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:40:32 +0200
Subject: [PATCH 25/45] Fix publishing CI

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 0f96987..7fdc962 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,8 +21,8 @@ jobs:
 
       - name: Build packages
         run: |
-          pip install -U pip build
-          python -m build sdist wheel
+          pip install -U pip hatch
+          HATCH_BUILD_HOOKS_ENABLE=true hatch build
 
       - name: Upload to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1.8

From 2d618509e0a28cab461ecda24283c6d87badd977 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:01:09 +0200
Subject: [PATCH 26/45] Fix favicon / illustrations handling to not downscale
 then upscale

---
 kolibri2zim/scraper.py | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index af2b073..54bc892 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -797,7 +797,7 @@ def run(self):
             Creator=self.author,
             Publisher=self.publisher,
             Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"),
-            Illustration_48x48_at_1=self.favicon_fpath.read_bytes(),
+            Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(),
         )
         self.creator.start()
 
@@ -957,27 +957,23 @@ def retrieve_favicon(self):
                 )
 
         # convert to PNG (might already be PNG but it's OK)
-        favicon_fpath = favicon_orig.with_suffix(".png")
-        convert_image(favicon_orig, favicon_fpath)
+        self.favicon_48_fpath = favicon_orig.with_suffix(".48.png")
+        convert_image(favicon_orig, self.favicon_48_fpath)
 
-        # resize to appropriate size (ZIM uses 48x48 so we double for retina)
-        for size in (96, 48):
-            resize_image(favicon_fpath, width=size, height=size, method="thumbnail")
-            with open(favicon_fpath, "rb") as fh:
-                self.creator.add_illustration(size, fh.read())
+        self.favicon_96_fpath = favicon_orig.with_suffix(".96.png")
+        convert_image(favicon_orig, self.favicon_96_fpath)
 
-        # resize to appropriate size (ZIM uses 48x48)
-        resize_image(favicon_fpath, width=96, height=96, method="thumbnail")
+        # resize to appropriate size (ZIM uses 48x48 so we double for retina)
+        resize_image(self.favicon_48_fpath, width=48, height=48, method="thumbnail")
+        resize_image(self.favicon_96_fpath, width=96, height=96, method="thumbnail")
 
         # generate favicon
-        favicon_ico_path = favicon_fpath.with_suffix(".ico")
-        create_favicon(src=favicon_fpath, dst=favicon_ico_path)
-
-        self.favicon_fpath = favicon_fpath
-        self.favicon_ico_path = favicon_ico_path
+        self.favicon_ico_path = favicon_orig.with_suffix(".ico")
+        create_favicon(src=self.favicon_96_fpath, dst=self.favicon_ico_path)
 
     def add_favicon(self):
-        self.creator.add_item_for("favicon.png", fpath=self.favicon_fpath)
+        self.creator.add_illustration(96, self.favicon_96_fpath.read_bytes())
+        self.creator.add_item_for("favicon.png", fpath=self.favicon_96_fpath)
         self.creator.add_item_for("favicon.ico", fpath=self.favicon_ico_path)
 
     def add_custom_about_and_css(self):

From f48532b349506e3956ad98a3d40a7719511c975c Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:10:02 +0200
Subject: [PATCH 27/45] Truncate description and add truncated long description

---
 kolibri2zim/scraper.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 54bc892..3037e77 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -18,6 +18,10 @@
 from bs4 import BeautifulSoup
 from kiwixstorage import KiwixStorage
 from pif import get_public_ip
+from zimscraperlib.constants import (
+    MAXIMUM_DESCRIPTION_METADATA_LENGTH,
+    MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
+)
 from zimscraperlib.filesystem import get_file_mimetype
 from zimscraperlib.i18n import find_language_names
 from zimscraperlib.image.convertion import convert_image, create_favicon
@@ -793,7 +797,16 @@ def run(self):
             Name=self.clean_fname,
             Language="eng",
             Title=self.title,
-            Description=self.description,
+            Description=(
+                f"{self.description[0:MAXIMUM_DESCRIPTION_METADATA_LENGTH-4]} ..."
+                if len(self.description) > MAXIMUM_DESCRIPTION_METADATA_LENGTH
+                else self.description
+            ),
+            LongDescription=(
+                f"{self.description[0:MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH-4]} ..."
+                if len(self.description) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
+                else self.description
+            ),
             Creator=self.author,
             Publisher=self.publisher,
             Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"),

From c28dc1254d60f2a8ed372d840bc9f0486e645d01 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:10:41 +0200
Subject: [PATCH 28/45] zimscraperlib now supports datetime + this avoids
 formatting issue

---
 kolibri2zim/scraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 3037e77..37bb764 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -809,7 +809,7 @@ def run(self):
             ),
             Creator=self.author,
             Publisher=self.publisher,
-            Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"),
+            Date=datetime.datetime.now(datetime.UTC),
             Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(),
         )
         self.creator.start()

From d755683781e36862c55ca31a98a9fa53511b2e6e Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:28:24 +0200
Subject: [PATCH 29/45] Images must be squared

---
 kolibri2zim/scraper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 37bb764..6b8d54d 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -977,8 +977,8 @@ def retrieve_favicon(self):
         convert_image(favicon_orig, self.favicon_96_fpath)
 
         # resize to appropriate size (ZIM uses 48x48 so we double for retina)
-        resize_image(self.favicon_48_fpath, width=48, height=48, method="thumbnail")
-        resize_image(self.favicon_96_fpath, width=96, height=96, method="thumbnail")
+        resize_image(self.favicon_48_fpath, width=48, height=48, method="contain")
+        resize_image(self.favicon_96_fpath, width=96, height=96, method="contain")
 
         # generate favicon
         self.favicon_ico_path = favicon_orig.with_suffix(".ico")

From 977aaffcbd00623ac8c835f47961039e442b8ce6 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:28:51 +0200
Subject: [PATCH 30/45] Fix changelog

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bfc2123..adbabde 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,12 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 - Fix issue with ePub rendering which was outside the iframe
+- Description is now limited to expected lenght and long description is set
+- Icons and illustrations are squared as expected
 - Many small fixes (including some bugs) detected by ruff / pyright
 
 ### Changed
 
 - Migrate to our new Python standard (hatch, ruff, pyright, ...)
-- Using zimscraperlib 3.1.0
+- Using zimscraperlib 3.1.1
 - Updated image to `python:3.11-bullseye`
 - Retry video reencoding up to three times
 - Move inline javascript to dedicated files

From 7186e5f44e4d1d2a54db1df15aa002ee0dce5f54 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 13:54:19 +0200
Subject: [PATCH 31/45] Enhance ZIM description handling and add support for long
 description

---
 kolibri2zim/entrypoint.py |  5 ++++
 kolibri2zim/scraper.py    | 51 ++++++++++++++++++++++++++++-----------
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/kolibri2zim/entrypoint.py b/kolibri2zim/entrypoint.py
index 98576e8..856bea2 100755
--- a/kolibri2zim/entrypoint.py
+++ b/kolibri2zim/entrypoint.py
@@ -47,6 +47,11 @@ def main():
         help="Custom description for your ZIM. Kolibri channel description otherwise",
     )
 
+    parser.add_argument(
+        "--long-description",
+        help="Custom long description for your ZIM, optional",
+    )
+
     parser.add_argument(
         "--favicon",
         help="URL/path for Favicon. Kolibri channel thumbnail otherwise "
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 6b8d54d..b1fb7fa 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -19,8 +19,10 @@
 from kiwixstorage import KiwixStorage
 from pif import get_public_ip
 from zimscraperlib.constants import (
-    MAXIMUM_DESCRIPTION_METADATA_LENGTH,
-    MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
+    MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH,
+)
+from zimscraperlib.constants import (
+    MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH,
 )
 from zimscraperlib.filesystem import get_file_mimetype
 from zimscraperlib.i18n import find_language_names
@@ -50,6 +52,7 @@
     "fname",
     "title",
     "description",
+    "long_description",
     "creator",
     "publisher",
     "tags",
@@ -113,6 +116,7 @@ def go(option):
             self.tags = [t.strip() for t in tags.split(",")]
         self.title = go("title")
         self.description = go("description")
+        self.long_description = go("long_description")
         self.author = go("creator")
         self.publisher = go("publisher")
         self.name = go("name")
@@ -797,16 +801,8 @@ def run(self):
             Name=self.clean_fname,
             Language="eng",
             Title=self.title,
-            Description=(
-                f"{self.description[0:MAXIMUM_DESCRIPTION_METADATA_LENGTH-4]} ..."
-                if len(self.description) > MAXIMUM_DESCRIPTION_METADATA_LENGTH
-                else self.description
-            ),
-            LongDescription=(
-                f"{self.description[0:MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH-4]} ..."
-                if len(self.description) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
-                else self.description
-            ),
+            Description=self.description,
+            LongDescription=self.long_description,
             Creator=self.author,
             Publisher=self.publisher,
             Date=datetime.datetime.now(datetime.UTC),
@@ -930,8 +926,35 @@ def sanitize_inputs(self):
         self.title = self.title.strip()
 
         if not self.description:
-            self.description = channel_meta["description"]
-        self.description = self.description.strip()
+            # User did not provided a description, we will infer it from channel
+            # metadata, limited to maximum length
+            if self.long_description:
+                raise ValueError(
+                    "long_description cannot be set if description is not set"
+                )
+            self.description = channel_meta["description"].strip()
+            if len(self.description) > MAX_DESC_LENGTH:
+                self.long_description = self.description
+                self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…"
+                if len(self.long_description > MAX_LONG_DESC_LENGTH):
+                    self.long_description = (
+                        f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…"
+                    )
+        else:
+            self.description = self.description.strip()
+            if len(self.description) > MAX_DESC_LENGTH:
+                raise ValueError(
+                    f"description is too long ({len(self.description)}"
+                    f">{MAX_DESC_LENGTH})"
+                )
+            if (
+                self.long_description
+                and len(self.long_description) > MAX_LONG_DESC_LENGTH
+            ):
+                raise ValueError(
+                    f"long_description is too long ({len(self.long_description)}"
+                    f">{MAX_LONG_DESC_LENGTH})"
+                )
 
         if not self.author:
             self.author = channel_meta["author"] or "Kolibri"

From 2000d7e6c86e0159ad2b7410ed64f847a3df78ea Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 15:24:04 +0200
Subject: [PATCH 32/45] Add tests for description/long-description computations
 (including folder layout changes to accommodate tests)

---
 .github/workflows/Tests.yml                   |  38 ++++
 .gitignore                                    |  20 +-
 get_js_deps.sh                                |   2 +-
 .../templates/assets/perseus_exercise.js      |   1 -
 pyproject.toml                                |  49 ++++-
 {kolibri2zim => src/kolibri2zim}/__about__.py |   0
 {kolibri2zim => src/kolibri2zim}/__init__.py  |   0
 {kolibri2zim => src/kolibri2zim}/__main__.py  |   0
 {kolibri2zim => src/kolibri2zim}/constants.py |   0
 {kolibri2zim => src/kolibri2zim}/database.py  |   0
 {kolibri2zim => src/kolibri2zim}/debug.py     |   0
 .../kolibri2zim}/entrypoint.py                |   0
 {kolibri2zim => src/kolibri2zim}/nodes.py     |   0
 .../kolibri2zim}/processing.py                |   0
 {kolibri2zim => src/kolibri2zim}/scraper.py   |   2 +-
 .../kolibri2zim}/templates/about.html         |   0
 .../kolibri2zim}/templates/assets/document.js |   2 +-
 .../templates/assets/epub_embed.css           |   2 +-
 .../templates/assets/epub_embed.html          |   0
 .../templates/assets/epub_embed.js            |   2 +-
 .../templates/assets/perseus_exercise.js      |   1 +
 .../kolibri2zim}/templates/audio.html         |   0
 .../kolibri2zim}/templates/base.html          |   0
 .../kolibri2zim}/templates/card.html          |   0
 .../kolibri2zim}/templates/document.html      |   2 +-
 .../kolibri2zim}/templates/epub.html          |   2 -
 .../kolibri2zim}/templates/kolibri-logo.png   | Bin
 .../kolibri2zim}/templates/node_meta.html     |   0
 .../templates/perseus_exercise.html           |   0
 .../kolibri2zim}/templates/topic.html         |   0
 .../kolibri2zim}/templates/video.html         |   0
 tasks.py                                      |  26 +++
 tests/conftest.py                             |  60 ++++++
 tests/test_sanitize_inputs.py                 | 193 ++++++++++++++++++
 34 files changed, 379 insertions(+), 23 deletions(-)
 create mode 100644 .github/workflows/Tests.yml
 delete mode 100644 kolibri2zim/templates/assets/perseus_exercise.js
 rename {kolibri2zim => src/kolibri2zim}/__about__.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/__init__.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/__main__.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/constants.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/database.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/debug.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/entrypoint.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/nodes.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/processing.py (100%)
 rename {kolibri2zim => src/kolibri2zim}/scraper.py (99%)
 rename {kolibri2zim => src/kolibri2zim}/templates/about.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/assets/document.js (94%)
 rename {kolibri2zim => src/kolibri2zim}/templates/assets/epub_embed.css (99%)
 rename {kolibri2zim => src/kolibri2zim}/templates/assets/epub_embed.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/assets/epub_embed.js (94%)
 create mode 100644 src/kolibri2zim/templates/assets/perseus_exercise.js
 rename {kolibri2zim => src/kolibri2zim}/templates/audio.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/base.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/card.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/document.html (99%)
 rename {kolibri2zim => src/kolibri2zim}/templates/epub.html (99%)
 rename {kolibri2zim => src/kolibri2zim}/templates/kolibri-logo.png (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/node_meta.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/perseus_exercise.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/topic.html (100%)
 rename {kolibri2zim => src/kolibri2zim}/templates/video.html (100%)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_sanitize_inputs.py

diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml
new file mode 100644
index 0000000..53ace14
--- /dev/null
+++ b/.github/workflows/Tests.yml
@@ -0,0 +1,38 @@
+name: Tests
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+jobs:
+  run-tests:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version-file: pyproject.toml
+          architecture: x64
+
+      - name: Install dependencies (and project)
+        run: |
+          pip install -U pip
+          pip install -e .[test,scripts]
+
+      - name: Run the tests
+        run: inv coverage --args "-vvv"
+
+      - name: Upload coverage report to codecov
+        uses: codecov/codecov-action@v3
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+
+      - name: Ensure we can build targets
+        run: |
+          pip install build
+          python3 -m build
diff --git a/.gitignore b/.gitignore
index d4c96ac..db971bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -367,17 +367,17 @@ pyrightconfig.json
 
 
 # assets that we download
-kolibri2zim/templates/assets/bootstrap/
-kolibri2zim/templates/assets/pdfjs/
-kolibri2zim/templates/assets/videojs/
-kolibri2zim/templates/assets/jquery.min.js
-kolibri2zim/templates/assets/ogvjs/
-kolibri2zim/templates/assets/videojs-ogvjs.js
 .dockerignore
-kolibri2zim/templates/assets/epub.min.js
-kolibri2zim/templates/assets/bootstrap-icons/
-kolibri2zim/templates/assets/jszip.min.js
-kolibri2zim/templates/assets/perseus/
+src/kolibri2zim/templates/assets/bootstrap/
+src/kolibri2zim/templates/assets/pdfjs/
+src/kolibri2zim/templates/assets/videojs/
+src/kolibri2zim/templates/assets/jquery.min.js
+src/kolibri2zim/templates/assets/ogvjs/
+src/kolibri2zim/templates/assets/videojs-ogvjs.js
+src/kolibri2zim/templates/assets/epub.min.js
+src/kolibri2zim/templates/assets/bootstrap-icons/
+src/kolibri2zim/templates/assets/jszip.min.js
+src/kolibri2zim/templates/assets/perseus/
 
 # output dir
 output
diff --git a/get_js_deps.sh b/get_js_deps.sh
index 9936269..8954e5b 100755
--- a/get_js_deps.sh
+++ b/get_js_deps.sh
@@ -23,7 +23,7 @@ fi
 
 # Absolute path this script is in.
 SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
-ASSETS_PATH="${SCRIPT_PATH}/kolibri2zim/templates/assets"
+ASSETS_PATH="${SCRIPT_PATH}/src/kolibri2zim/templates/assets"
 
 echo "About to download JS assets to ${ASSETS_PATH}"
 
diff --git a/kolibri2zim/templates/assets/perseus_exercise.js b/kolibri2zim/templates/assets/perseus_exercise.js
deleted file mode 100644
index dfbc7a7..0000000
--- a/kolibri2zim/templates/assets/perseus_exercise.js
+++ /dev/null
@@ -1 +0,0 @@
-less = { env: 'development', logLevel: 1 };
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index ebb50c0..6bd8c4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,10 +29,15 @@ dynamic = ["version"]
 scripts = ["invoke==2.1.3"]
 lint = ["black==23.3.0", "ruff==0.0.272"]
 check = ["pyright==1.1.317"]
+test = [
+  "pytest==7.4.0",
+  "coverage==7.2.7",
+]
 dev = [
     "debugpy",
     "kolibri2zim[scripts]",
     "kolibri2zim[lint]",
+    "kolibri2zim[test]",
     "kolibri2zim[check]",
     "hatchling",
 ]
@@ -45,19 +50,27 @@ Donate = "https://www.kiwix.org/en/support-us/"
 kolibri2zim = "kolibri2zim:entrypoint.main"
 
 [tool.hatch.version]
-path = "kolibri2zim/__about__.py"
+path = "src/kolibri2zim/__about__.py"
 
 [tool.hatch.build]
 exclude = ["/.github"]
 
 [tool.hatch.build.hooks.custom]
-enable-by-default = false
 path = "hatch_build.py"
 dependencies = ["zimscraperlib==3.1.1"]
 
 [tool.hatch.envs.default]
 features = ["dev"]
 
+[tool.hatch.envs.test]
+features = ["scripts", "test"]
+
+[tool.hatch.envs.test.scripts]
+run = "inv test --args '{args}'"
+run-cov = "inv test-cov --args '{args}'"
+report-cov = "inv report-cov"
+coverage = "inv coverage --args '{args}'"
+
 [tool.hatch.envs.lint]
 template = "lint"
 python = "py311"
@@ -176,13 +189,41 @@ ban-relative-imports = "all"
 # Tests can use magic values, assertions, and relative imports
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
 
+[tool.pytest.ini_options]
+minversion = "7.3"
+testpaths = ["tests"]
+pythonpath = [".", "src"]
+
+[tool.coverage.paths]
+great_project = ["src/kolibri2zim"]
+tests = ["tests"]
+
+[tool.coverage.run]
+source_pkgs = ["kolibri2zim"]
+branch = true
+parallel = true
+omit = [
+  "src/kolibri2zim/__about__.py",
+]
+
+[tool.coverage.report]
+exclude_lines = [
+  "no cov",
+  "if __name__ == .__main__.:",
+  "if TYPE_CHECKING:",
+]
+
 [tool.pyright]
 pythonVersion = "3.11"
 pythonPlatform = "All"
 typeCheckingMode = "basic"
 
-include = ["kolibri2zim"]
+include = ["src", "tests", "tasks.py"]
 exclude = ["**/node_modules",
     "**/__pycache__",
-    "kolibri2zim/templates",
+    "src/kolibri2zim/templates",
+]
+
+executionEnvironments= [
+    { root= "src" }
 ]
diff --git a/kolibri2zim/__about__.py b/src/kolibri2zim/__about__.py
similarity index 100%
rename from kolibri2zim/__about__.py
rename to src/kolibri2zim/__about__.py
diff --git a/kolibri2zim/__init__.py b/src/kolibri2zim/__init__.py
similarity index 100%
rename from kolibri2zim/__init__.py
rename to src/kolibri2zim/__init__.py
diff --git a/kolibri2zim/__main__.py b/src/kolibri2zim/__main__.py
similarity index 100%
rename from kolibri2zim/__main__.py
rename to src/kolibri2zim/__main__.py
diff --git a/kolibri2zim/constants.py b/src/kolibri2zim/constants.py
similarity index 100%
rename from kolibri2zim/constants.py
rename to src/kolibri2zim/constants.py
diff --git a/kolibri2zim/database.py b/src/kolibri2zim/database.py
similarity index 100%
rename from kolibri2zim/database.py
rename to src/kolibri2zim/database.py
diff --git a/kolibri2zim/debug.py b/src/kolibri2zim/debug.py
similarity index 100%
rename from kolibri2zim/debug.py
rename to src/kolibri2zim/debug.py
diff --git a/kolibri2zim/entrypoint.py b/src/kolibri2zim/entrypoint.py
similarity index 100%
rename from kolibri2zim/entrypoint.py
rename to src/kolibri2zim/entrypoint.py
diff --git a/kolibri2zim/nodes.py b/src/kolibri2zim/nodes.py
similarity index 100%
rename from kolibri2zim/nodes.py
rename to src/kolibri2zim/nodes.py
diff --git a/kolibri2zim/processing.py b/src/kolibri2zim/processing.py
similarity index 100%
rename from kolibri2zim/processing.py
rename to src/kolibri2zim/processing.py
diff --git a/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
similarity index 99%
rename from kolibri2zim/scraper.py
rename to src/kolibri2zim/scraper.py
index b1fb7fa..1bf0fcb 100644
--- a/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -936,7 +936,7 @@ def sanitize_inputs(self):
             if len(self.description) > MAX_DESC_LENGTH:
                 self.long_description = self.description
                 self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…"
-                if len(self.long_description > MAX_LONG_DESC_LENGTH):
+                if len(self.long_description) > MAX_LONG_DESC_LENGTH:
                     self.long_description = (
                         f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…"
                     )
diff --git a/kolibri2zim/templates/about.html b/src/kolibri2zim/templates/about.html
similarity index 100%
rename from kolibri2zim/templates/about.html
rename to src/kolibri2zim/templates/about.html
diff --git a/kolibri2zim/templates/assets/document.js b/src/kolibri2zim/templates/assets/document.js
similarity index 94%
rename from kolibri2zim/templates/assets/document.js
rename to src/kolibri2zim/templates/assets/document.js
index a0826a0..b085bf6 100644
--- a/kolibri2zim/templates/assets/document.js
+++ b/src/kolibri2zim/templates/assets/document.js
@@ -6,4 +6,4 @@ function resizeFrameToFullHeight(){
     frame.style.height = newHeight + 'px';
 }
 window.addEventListener('resize', resizeFrameToFullHeight, {capture: true});
-resizeFrameToFullHeight();
\ No newline at end of file
+resizeFrameToFullHeight();
diff --git a/kolibri2zim/templates/assets/epub_embed.css b/src/kolibri2zim/templates/assets/epub_embed.css
similarity index 99%
rename from kolibri2zim/templates/assets/epub_embed.css
rename to src/kolibri2zim/templates/assets/epub_embed.css
index 25aed6d..2a5e859 100644
--- a/kolibri2zim/templates/assets/epub_embed.css
+++ b/src/kolibri2zim/templates/assets/epub_embed.css
@@ -280,4 +280,4 @@ svg {
 #opener:hover {
     stroke: #777;
     fill: #777;
-}
\ No newline at end of file
+}
diff --git a/kolibri2zim/templates/assets/epub_embed.html b/src/kolibri2zim/templates/assets/epub_embed.html
similarity index 100%
rename from kolibri2zim/templates/assets/epub_embed.html
rename to src/kolibri2zim/templates/assets/epub_embed.html
diff --git a/kolibri2zim/templates/assets/epub_embed.js b/src/kolibri2zim/templates/assets/epub_embed.js
similarity index 94%
rename from kolibri2zim/templates/assets/epub_embed.js
rename to src/kolibri2zim/templates/assets/epub_embed.js
index 6e6c0fc..fd269c5 100644
--- a/kolibri2zim/templates/assets/epub_embed.js
+++ b/src/kolibri2zim/templates/assets/epub_embed.js
@@ -48,4 +48,4 @@ var params = URLSearchParams && new URLSearchParams(document.location.search.sub
     };
 
     rendition.on("keyup", keyListener);
-    document.addEventListener("keyup", keyListener, false);
\ No newline at end of file
+    document.addEventListener("keyup", keyListener, false);
diff --git a/src/kolibri2zim/templates/assets/perseus_exercise.js b/src/kolibri2zim/templates/assets/perseus_exercise.js
new file mode 100644
index 0000000..bce3f89
--- /dev/null
+++ b/src/kolibri2zim/templates/assets/perseus_exercise.js
@@ -0,0 +1 @@
+less = { env: 'development', logLevel: 1 };
diff --git a/kolibri2zim/templates/audio.html b/src/kolibri2zim/templates/audio.html
similarity index 100%
rename from kolibri2zim/templates/audio.html
rename to src/kolibri2zim/templates/audio.html
diff --git a/kolibri2zim/templates/base.html b/src/kolibri2zim/templates/base.html
similarity index 100%
rename from kolibri2zim/templates/base.html
rename to src/kolibri2zim/templates/base.html
diff --git a/kolibri2zim/templates/card.html b/src/kolibri2zim/templates/card.html
similarity index 100%
rename from kolibri2zim/templates/card.html
rename to src/kolibri2zim/templates/card.html
diff --git a/kolibri2zim/templates/document.html b/src/kolibri2zim/templates/document.html
similarity index 99%
rename from kolibri2zim/templates/document.html
rename to src/kolibri2zim/templates/document.html
index eec2b56..18dba86 100644
--- a/kolibri2zim/templates/document.html
+++ b/src/kolibri2zim/templates/document.html
@@ -49,7 +49,7 @@
 </ul>
 
 <iframe id="frame" width="100%" height="100%" src="{{ target }}" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe" allow="autoplay; fullscreen">
-  
+
 </iframe>
 {% endblock %}
 
diff --git a/kolibri2zim/templates/epub.html b/src/kolibri2zim/templates/epub.html
similarity index 99%
rename from kolibri2zim/templates/epub.html
rename to src/kolibri2zim/templates/epub.html
index 306ca09..00f949c 100644
--- a/kolibri2zim/templates/epub.html
+++ b/src/kolibri2zim/templates/epub.html
@@ -8,5 +8,3 @@
   <p>You should get an epub reader here someday. In the mean time, just <a href="{{ filename }}">Open EPUB directly</a></p>
 </body>
 </html>
-
-
diff --git a/kolibri2zim/templates/kolibri-logo.png b/src/kolibri2zim/templates/kolibri-logo.png
similarity index 100%
rename from kolibri2zim/templates/kolibri-logo.png
rename to src/kolibri2zim/templates/kolibri-logo.png
diff --git a/kolibri2zim/templates/node_meta.html b/src/kolibri2zim/templates/node_meta.html
similarity index 100%
rename from kolibri2zim/templates/node_meta.html
rename to src/kolibri2zim/templates/node_meta.html
diff --git a/kolibri2zim/templates/perseus_exercise.html b/src/kolibri2zim/templates/perseus_exercise.html
similarity index 100%
rename from kolibri2zim/templates/perseus_exercise.html
rename to src/kolibri2zim/templates/perseus_exercise.html
diff --git a/kolibri2zim/templates/topic.html b/src/kolibri2zim/templates/topic.html
similarity index 100%
rename from kolibri2zim/templates/topic.html
rename to src/kolibri2zim/templates/topic.html
diff --git a/kolibri2zim/templates/video.html b/src/kolibri2zim/templates/video.html
similarity index 100%
rename from kolibri2zim/templates/video.html
rename to src/kolibri2zim/templates/video.html
diff --git a/tasks.py b/tasks.py
index 424223b..3370b73 100644
--- a/tasks.py
+++ b/tasks.py
@@ -7,6 +7,32 @@
 use_pty = not os.getenv("CI", "")
 
 
+@task(optional=["args"], help={"args": "pytest additional arguments"})
+def test(ctx: Context, args: str | None = ""):
+    """run tests (without coverage)"""
+    ctx.run(f"pytest {args}", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "pytest additional arguments"})
+def test_cov(ctx: Context, args: str | None = ""):
+    """run tests with coverage"""
+    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
+
+
+@task()
+def report_cov(ctx: Context):
+    """report coverage"""
+    ctx.run("coverage combine", warn=True, pty=use_pty)
+    ctx.run("coverage report --show-missing", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "pytest additional arguments"})
+def coverage(ctx: Context, args: str | None = ""):
+    """run tests and report coverage"""
+    test_cov(ctx, args)
+    report_cov(ctx)
+
+
 @task(
     optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"}
 )
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..0210377
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,60 @@
+from collections.abc import Callable, Generator
+from typing import Any
+
+import pytest
+
+from kolibri2zim.scraper import Kolibri2Zim, KolibriDB
+from kolibri2zim.scraper import options as expected_options_keys
+
+CHANNEL_NAME = "channel_name"
+CHANNEL_DESCRIPTION = "a description"
+
+
+class FakeDb(KolibriDB):
+    def __init__(
+        self,
+        channel_name: str,
+        channel_description: str,
+        channel_author: str | None,
+    ):
+        self.channel_name = channel_name
+        self.channel_description = channel_description
+        self.channel_author = channel_author
+
+    def get_channel_metadata(self, _):
+        return {
+            "name": self.channel_name,
+            "description": self.channel_description,
+            "author": self.channel_author,
+        }
+
+
+@pytest.fixture()
+def scraper_generator() -> Generator[Callable[..., Kolibri2Zim], None, None]:
+    def _scraper(
+        channel_name: str = CHANNEL_NAME,
+        channel_description: str = CHANNEL_DESCRIPTION,
+        channel_author: str | None = None,
+        additional_options: dict[str, Any] = {},
+    ) -> Kolibri2Zim:
+        options = {}
+        for option_key in expected_options_keys:
+            options[option_key] = None
+        options.update(additional_options)
+        scraper = Kolibri2Zim(**options)
+        scraper.db = FakeDb(
+            channel_author=channel_author,
+            channel_description=channel_description,
+            channel_name=channel_name,
+        )
+        return scraper
+
+    yield _scraper
+
+
+# @pytest.fixture
+# def default_options() -> Generator[dict[str, Any], None, None]:
+#     default_options = {}
+#     for option in options:
+#         default_options[option] = None
+#     yield default_options
diff --git a/tests/test_sanitize_inputs.py b/tests/test_sanitize_inputs.py
new file mode 100644
index 0000000..1504c12
--- /dev/null
+++ b/tests/test_sanitize_inputs.py
@@ -0,0 +1,193 @@
+import random
+import string
+from collections.abc import Callable
+
+import pytest
+from zimscraperlib.constants import MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LEN
+from zimscraperlib.constants import (
+    MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LEN,
+)
+
+from kolibri2zim.scraper import Kolibri2Zim
+
+
+def randomword(length):
+    letters = string.ascii_lowercase
+    return "".join(random.choice(letters) for i in range(length))  # noqa: S311
+
+
+def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]):
+    scraper = scraper_generator()
+    scraper.sanitize_inputs()
+
+
+TEXT_NOT_USED = "text not used"
+
+LONG_TEXT = (
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
+    "incididunt ut labore et dolore magna aliqua. At erat pellentesque adipiscing "
+    "commodo elit at imperdiet. Rutrum tellus pellentesque eu tincidunt tortor aliquam"
+    " nulla facilisi. Eget lorem dolor sed viverra ipsum nunc. Ipsum nunc aliquet "
+    "bibendum enim facilisis gravida neque convallis. Aliquam malesuada bibendum arcu "
+    "vitae elementum curabitur. Platea dictumst quisque sagittis purus sit amet "
+    "volutpat. Blandit libero volutpat sed cras ornare. In eu mi bibendum neque "
+    "egestas. Egestas dui id ornare arcu odio. Pulvinar neque laoreet suspendisse "
+    "interdum. Fames ac turpis egestas integer eget aliquet nibh praesent tristique. Et"
+    " egestas quis ipsum suspendisse ultrices gravida dictum fusce. Malesuada fames ac "
+    "turpis egestas. Tincidunt nunc pulvinar sapien et ligula ullamcorper malesuada "
+    "proin libero. In arcu cursus euismod quis viverra. Faucibus in ornare quam viverra"
+    ". Curabitur vitae nunc sed velit dignissim sodales ut eu sem. Velit scelerisque in"
+    " dictum non consectetur a erat nam. Proin fermentum leo vel orci porta non. Fames"
+    " ac turpis egestas sed tempus. Vitae justo eget magna fermentum iaculis eu non. "
+    "Imperdiet massa tincidunt nunc pulvinar sapien et ligula. Laoreet sit amet cursus "
+    "sit amet dictum sit amet. Quis hendrerit dolor magna eget. Orci ac auctor augue "
+    "mauris augue. Consequat interdum varius sit amet mattis. At ultrices mi tempus "
+    "imperdiet nulla malesuada pellentesque elit. Volutpat est velit egestas dui. "
+    "Potenti nullam ac tortor vitae. At tempor commodo ullamcorper a lacus vestibulum "
+    "sed arcu non. Duis ut diam quam nulla. Vestibulum mattis ullamcorper velit sed "
+    "ullamcorper. Sit amet commodo nulla facilisi nullam vehicula. Faucibus purus in "
+    "massa tempor nec feugiat. Sem fringilla ut morbi tincidunt augue interdum velit. "
+    "Etiam dignissim diam quis enim lobortis scelerisque fermentum dui. Nunc vel risus "
+    "commodo viverra maecenas accumsan. Aenean sed adipiscing diam donec adipiscing "
+    "tristique. Maecenas accumsan lacus vel facilisis volutpat est velit egestas. Nulla"
+    " aliquet porttitor lacus luctus accumsan tortor posuere ac. Habitant morbi "
+    "tristique senectus et netus et. Eget mi proin sed libero enim sed faucibus turpis "
+    "in. Vulputate enim nulla aliquet porttitor lacus. Dui ut ornare lectus sit amet "
+    "est. Quam lacus suspendisse faucibus interdum posuere. Sagittis orci a scelerisque"
+    " purus semper eget duis at tellus. Tellus molestie nunc non blandit massa. Feugiat"
+    " vivamus at augue eget arcu dictum varius duis at. Varius morbi enim nunc faucibus"
+    " a pellentesque sit. Id aliquet lectus proin nibh nisl condimentum id venenatis a."
+    " Tortor dignissim convallis aenean et tortor at risus viverra adipiscing. Aliquam "
+    "malesuada bibendum arcu vitae elementum curabitur vitae nunc sed. Habitasse platea"
+    " dictumst quisque sagittis purus sit amet volutpat. Vitae auctor eu augue ut "
+    "lectus. At varius vel pharetra vel turpis nunc eget. Dictum at tempor  commodo "
+    "ullamcorper a lacus vestibulum sed arcu. Pellentesque massa placerat duis "
+    "ultricies. Enim nunc faucibus a pellentesque sit amet porttitor eget dolor. "
+    "Volutpat blandit aliquam etiam erat velit scelerisque in. Amet mattis vulputate "
+    "enim nulla aliquet porttitor. Egestas maecenas pharetra convallis posuere morbi "
+    "leo urna molestie. Duis ut diam quam nulla porttitor massa id. In fermentum "
+    "posuere urna nec tincidunt praesent. Turpis egestas sed tempus urna et pharetra "
+    "pharetra massa. Tellus molestie nunc non blandit massa. Diam phasellus vestibulum "
+    "lorem sed risus ultricies. Egestas erat imperdiet sed euismod nisi porta lorem. "
+    "Quam viverra orci sagittis eu volutpat odio facilisis mauris sit. Ornare aenean "
+    "euismod elementum nisi quis. Laoreet non curabitur gravida arcu ac tortor "
+    "dignissim convallis aenean. Sagittis aliquam malesuada bibendum arcu vitae "
+    "elementum. Sed blandit libero volutpat sed cras ornare. Sagittis eu volutpat odio "
+    "facilisis mauris. Facilisis volutpat est velit egestas dui id ornare arcu odio. "
+    "Eu feugiat pretium  nibh."
+)
+
+
+@pytest.mark.parametrize(
+    "cli_description, cli_long_description, channel_description, raises, "
+    "expected_description, expected_long_description",
+    [
+        # CLI description set and is short, CLI long description not set, channel
+        # description does not matter
+        (
+            LONG_TEXT[0:MAX_DESC_LEN],
+            None,
+            TEXT_NOT_USED,
+            False,
+            LONG_TEXT[0:MAX_DESC_LEN],
+            None,
+        ),
+        # CLI description set and is too long, channel description does not matter
+        (LONG_TEXT[0 : MAX_DESC_LEN + 1], None, TEXT_NOT_USED, True, None, None),
+        # CLI description not set and channel description is short enough
+        (None, None, LONG_TEXT[0:MAX_DESC_LEN], False, LONG_TEXT[0:MAX_DESC_LEN], None),
+        # CLI description not set and channel description is too long for description
+        # but ok for long description
+        (
+            None,
+            None,
+            LONG_TEXT[0 : MAX_DESC_LEN + 1],
+            False,
+            LONG_TEXT[0 : MAX_DESC_LEN - 1] + "…",
+            LONG_TEXT[0 : MAX_DESC_LEN + 1],
+        ),
+        (
+            None,
+            None,
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
+            False,
+            LONG_TEXT[0 : MAX_DESC_LEN - 1] + "…",
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
+        ),
+        # CLI description not set and channel description is too long for description
+        # and long description
+        (
+            None,
+            None,
+            LONG_TEXT[0 : MAX_LONG_DESC_LEN + 1],
+            False,
+            LONG_TEXT[0 : MAX_DESC_LEN - 1] + "…",
+            LONG_TEXT[0 : MAX_LONG_DESC_LEN - 1] + "…",
+        ),
+        # CLI description set and is short, CLI long description set and is short,
+        # channel description does not matter
+        (
+            LONG_TEXT[0:MAX_DESC_LEN],
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
+            TEXT_NOT_USED,
+            False,
+            LONG_TEXT[0:MAX_DESC_LEN],
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
+        ),
+        # CLI description set and is short, CLI long description set and is too long,
+        # channel description does not matter
+        (
+            LONG_TEXT[0:MAX_DESC_LEN],
+            LONG_TEXT[0 : MAX_LONG_DESC_LEN + 1],
+            TEXT_NOT_USED,
+            True,
+            None,
+            None,
+        ),
+        # CLI description not set, CLI long description set and is short,
+        # channel description does not matter
+        (
+            None,
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
+            TEXT_NOT_USED,
+            True,
+            None,
+            None,
+        ),
+    ],
+)
+def test_description(
+    cli_description: str,
+    cli_long_description: str,
+    channel_description: str,
+    *,
+    raises: bool,
+    expected_description: str,
+    expected_long_description: str,
+    scraper_generator: Callable[..., Kolibri2Zim],
+):
+    if channel_description:
+        scraper = scraper_generator(
+            channel_description=channel_description,
+            additional_options={
+                "description": cli_description,
+                "long_description": cli_long_description,
+            },
+        )
+    else:
+        scraper = scraper_generator(
+            additional_options={
+                "description": cli_description,
+                "long_description": cli_long_description,
+            }
+        )
+
+    if raises:
+        with pytest.raises(ValueError):
+            scraper.sanitize_inputs()
+        return
+    else:
+        scraper.sanitize_inputs()
+
+    assert scraper.description == expected_description
+    assert scraper.long_description == expected_long_description

From 33cab630979c0f6f325e1441f690c61dc78f1e05 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 17:10:39 +0200
Subject: [PATCH 33/45] Fix wrong method for items

---
 src/kolibri2zim/scraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
index 1bf0fcb..d96cde6 100644
--- a/src/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -282,7 +282,7 @@ def funnel_from_s3(self, file_id, path, checksum, preset):
                 "fileobj": fileobj,
                 "mimetype": preset.mimetype,
             }
-            self.creator.add_item_for(StaticItem(**kwargs))
+            self.creator.add_item(StaticItem(**kwargs))
         logger.debug(f"Added {path} from S3::{key}")
         return True
 

From f69c68a9f4300de571bb0972dc10c082ddda6838 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 17:11:23 +0200
Subject: [PATCH 34/45] Move code back to its original position, should not
 have been moved

---
 src/kolibri2zim/scraper.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
index d96cde6..7415e53 100644
--- a/src/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -753,6 +753,25 @@ def add_html5_node(self, node_id):
         logger.debug(f"Added HTML5 node #{node_id}")
 
     def run(self):
+        if self.s3_url_with_credentials and not self.s3_credentials_ok():
+            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
+
+        s3_msg = (
+            f"  using cache: {self.s3_storage.url.netloc} "
+            f"with bucket: {self.s3_storage.bucket_name}"
+            if self.s3_storage
+            else ""
+        )
+        logger.info(
+            f"Starting scraper with:\n"
+            f"  channel_id: {self.channel_id}\n"
+            f"  build_dir: {self.build_dir}\n"
+            f"  output_dir: {self.output_dir}\n"
+            f"  using webm : {self.use_webm}\n"
+            f"  low_quality : {self.low_quality}\n"
+            f"{s3_msg}"
+        )
+
         self.ensure_js_deps_are_present()
 
         logger.info("Download database")
@@ -1093,22 +1112,3 @@ def ensure_js_deps_are_present(self):
                     "It looks like JS deps have not been installed,"
                     f" {js_deps_dir} is missing"
                 )
-
-        if self.s3_url_with_credentials and not self.s3_credentials_ok():
-            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
-
-        s3_msg = (
-            f"  using cache: {self.s3_storage.url.netloc} "
-            f"with bucket: {self.s3_storage.bucket_name}"
-            if self.s3_storage
-            else ""
-        )
-        logger.info(
-            f"Starting scraper with:\n"
-            f"  channel_id: {self.channel_id}\n"
-            f"  build_dir: {self.build_dir}\n"
-            f"  output_dir: {self.output_dir}\n"
-            f"  using webm : {self.use_webm}\n"
-            f"  low_quality : {self.low_quality}\n"
-            f"{s3_msg}"
-        )

From f6ede9f9e4dbd0d0b037615d14652e164a293f85 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 17:11:58 +0200
Subject: [PATCH 35/45] Simplify ensure_js_deps_are_present

---
 src/kolibri2zim/constants.py | 14 ++++++++++++++
 src/kolibri2zim/scraper.py   | 30 ++++--------------------------
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/src/kolibri2zim/constants.py b/src/kolibri2zim/constants.py
index f8a27fb..53e0aa5 100644
--- a/src/kolibri2zim/constants.py
+++ b/src/kolibri2zim/constants.py
@@ -20,6 +20,20 @@
 STUDIO_DEFAULT_BASE_URL = "https://studio.learningequality.org"
 STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL)
 
+# when modifying this list, update list in hatch_build.py as well
+JS_DEPS: list[str] = [
+    "pdfjs",
+    "videojs",
+    "ogvjs",
+    "bootstrap",
+    "bootstrap-icons",
+    "perseus",
+    "epub.min.js",
+    "jszip.min.js",
+    "jquery.min.js",
+    "videojs-ogvjs.js",
+]
+
 
 def is_running_inside_container():
     fpath = pathlib.Path("/proc/self/cgroup")
diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
index 7415e53..c72d7ea 100644
--- a/src/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -7,7 +7,6 @@
 import hashlib
 import io
 import json
-import os
 import shutil
 import tempfile
 import threading
@@ -33,7 +32,7 @@
 from zimscraperlib.zim.creator import Creator
 from zimscraperlib.zim.items import StaticItem
 
-from kolibri2zim.constants import ROOT_DIR, STUDIO_URL, get_logger
+from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, get_logger
 from kolibri2zim.database import KolibriDB
 from kolibri2zim.debug import (
     ON_DISK_THRESHOLD,
@@ -1085,30 +1084,9 @@ def add_custom_about_and_css(self):
         logger.debug("Added about page and custom CSS")
 
     def ensure_js_deps_are_present(self):
-        for js_deps_file in [
-            "epub.min.js",
-            "jszip.min.js",
-            "jquery.min.js",
-            "videojs-ogvjs.js",
-        ]:
-            if not os.path.exists(
-                self.templates_dir.joinpath(f"assets/{js_deps_file}")
-            ):
-                raise ValueError(
-                    "It looks like JS deps have not been installed,"
-                    f" {js_deps_file} is missing"
-                )
-
-        for js_deps_dir in [
-            "pdfjs",
-            "videojs",
-            "ogvjs",
-            "bootstrap",
-            "bootstrap-icons",
-            "perseus",
-        ]:
-            if not os.path.exists(self.templates_dir.joinpath(f"assets/{js_deps_dir}")):
+        for dep in JS_DEPS:
+            if not self.templates_dir.joinpath(f"assets/{dep}").exists():
                 raise ValueError(
                     "It looks like JS deps have not been installed,"
-                    f" {js_deps_dir} is missing"
+                    f" {dep} is missing"
                 )

From 0a4082dbec93279f14fa1e85a2d3ac36592eadcf Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 17:48:24 +0200
Subject: [PATCH 36/45] Ensure assets are not downloaded twice at build time

---
 .github/workflows/publish.yml |  2 +-
 Dockerfile                    |  2 +-
 hatch_build.py                | 29 +++++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 7fdc962..108c895 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -22,7 +22,7 @@ jobs:
       - name: Build packages
         run: |
           pip install -U pip hatch
-          HATCH_BUILD_HOOKS_ENABLE=true hatch build
+          hatch build
 
       - name: Upload to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1.8
diff --git a/Dockerfile b/Dockerfile
index 89a7302..e38f96d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,7 +12,7 @@ RUN apt-get update -y \
 #RUN pip3 install --no-cache-dir -r /src/requirements.txt
 COPY kolibri2zim /src/kolibri2zim
 COPY pyproject.toml *.md get_js_deps.sh install.sh MANIFEST.in LICENSE *.py /src/
-RUN cd /src/ && HATCH_BUILD_HOOKS_ENABLE=true hatch build -t sdist && ./install.sh
+RUN cd /src/ && hatch build -t sdist && ./install.sh
 
 # default output directory
 RUN mkdir -p /output
diff --git a/hatch_build.py b/hatch_build.py
index d98f9e3..1ecbbe0 100644
--- a/hatch_build.py
+++ b/hatch_build.py
@@ -4,13 +4,42 @@
 
 from hatchling.builders.hooks.plugin.interface import BuildHookInterface
 
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
+# update list in constants.py as well
+JS_DEPS = [
+    "pdfjs",
+    "videojs",
+    "ogvjs",
+    "bootstrap",
+    "bootstrap-icons",
+    "perseus",
+    "epub.min.js",
+    "jszip.min.js",
+    "jquery.min.js",
+    "videojs-ogvjs.js",
+]
+
 
 class GetJsDepsHook(BuildHookInterface):
     def initialize(self, version, build_data):
+        if self.deps_already_installed():
+            logger.info("JS dependencies are already installed, skipping it")
+            return
+        Path(self.root).joinpath("src/kolibri2zim/templates/assets")
         subprocess.run(
             str(Path(self.root).joinpath("get_js_deps.sh")),  # noqa : S603
             check=True,
         )
         return super().initialize(version, build_data)
+
+    def deps_already_installed(self) -> bool:
+        for dep in JS_DEPS:
+            if (
+                not Path(self.root)
+                .joinpath(f"src/kolibri2zim/templates/assets/{dep}")
+                .exists()
+            ):
+                return False
+        return True

From 69200e4e28be4333d96bf5c5aac7117cfec446d5 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 18:18:35 +0200
Subject: [PATCH 37/45] Adapt to Python bootstrap changes

---
 Dockerfile     |  29 +++++----
 install.sh     |   5 --
 pyproject.toml | 160 ++++++++++++++++++++++++-------------------------
 3 files changed, 94 insertions(+), 100 deletions(-)
 delete mode 100755 install.sh

diff --git a/Dockerfile b/Dockerfile
index e38f96d..743bd2b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,18 +1,23 @@
-FROM python:3.11-bullseye
-LABEL org.opencontainers.image.source https://github.com/openzim/kolibri2zim
+FROM python:3.11-bookworm
+LABEL org.opencontainers.image.source https://github.com/openzim/kolibri
 
 # Install necessary packages
-RUN apt-get update -y \
-    && apt-get install -y --no-install-recommends locales-all unzip ffmpeg \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* \
-    && python -m pip install -U pip hatch
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+      locales-all \
+      unzip \
+      ffmpeg \
+ && rm -rf /var/lib/apt/lists/* \
+ && python -m pip install --no-cache-dir -U \
+      pip
 
-#COPY requirements.txt /src/
-#RUN pip3 install --no-cache-dir -r /src/requirements.txt
-COPY kolibri2zim /src/kolibri2zim
-COPY pyproject.toml *.md get_js_deps.sh install.sh MANIFEST.in LICENSE *.py /src/
-RUN cd /src/ && hatch build -t sdist && ./install.sh
+# Copy code + associated artifacts
+COPY src /src/src
+COPY pyproject.toml *.md get_js_deps.sh MANIFEST.in LICENSE *.py /src/
+
+# Install + cleanup
+RUN pip install --no-cache-dir /src \
+ && rm -rf /src
 
 # default output directory
 RUN mkdir -p /output
diff --git a/install.sh b/install.sh
deleted file mode 100755
index 0b6a66b..0000000
--- a/install.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-search_dir=/src/dist/*.tar.gz
-for entry in $search_dir
-do
-  pip install "$entry"
-done
diff --git a/pyproject.toml b/pyproject.toml
index 6bd8c4f..d09a6e2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,12 +34,13 @@ test = [
   "coverage==7.2.7",
 ]
 dev = [
-    "debugpy",
+    "pre-commit==3.3.3",
+    "debugpy==1.6.7",
     "kolibri2zim[scripts]",
     "kolibri2zim[lint]",
     "kolibri2zim[test]",
     "kolibri2zim[check]",
-    "hatchling",
+    "hatchling==1.18.0",
 ]
 
 [project.urls]
@@ -99,84 +100,81 @@ target-version = ['py311']
 [tool.ruff]
 target-version = "py311"
 line-length = 88
-src = ["kolibri2zim"]
+src = ["src"]
 select = [
-    # "A", # flake8-builtins
-    # "ANN",  # flake8-annotations
-    "ARG", # flake8-unused-arguments
-    # "ASYNC",  # flake8-async
-    # "B", # flake8-bugbear
-    # "BLE",  # flake8-blind-except
-    "C4",  # flake8-comprehensions
-    "C90", # mccabe
-    # "COM",  # flake8-commas
-    # "D",  # pydocstyle
-    # "DJ",  # flake8-django
-    "DTZ", # flake8-datetimez
-    "E",   # pycodestyle (default)
-    "EM",  # flake8-errmsg
-    # "ERA",  # eradicate
-    # "EXE",  # flake8-executable
-    "F", # Pyflakes (default)
-    # "FA",  # flake8-future-annotations
-    "FBT", # flake8-boolean-trap
-    # "FLY",  # flynt
-    # "G",  # flake8-logging-format
-    "I",   # isort
-    "ICN", # flake8-import-conventions
-    # "INP",  # flake8-no-pep420
-    # "INT",  # flake8-gettext
-    "ISC", # flake8-implicit-str-concat
-    "N",   # pep8-naming
-    # "NPY",  # NumPy-specific rules
-    # "PD",  # pandas-vet
-    # "PGH",  # pygrep-hooks
-    # "PIE",  # flake8-pie
-    # "PL",  # Pylint
-    "PLC", # Pylint: Convention
-    "PLE", # Pylint: Error
-    "PLR", # Pylint: Refactor
-    "PLW", # Pylint: Warning
-    # "PT",  # flake8-pytest-style
-    # "PTH",  # flake8-use-pathlib
-    # "PYI",  # flake8-pyi
-    "Q", # flake8-quotes
-    # "RET",  # flake8-return
-    # "RSE",  # flake8-raise
-    "RUF", # Ruff-specific rules
-    "S",   # flake8-bandit
-    # "SIM",  # flake8-simplify
-    # "SLF",  # flake8-self
-    "T10", # flake8-debugger
-    "T20", # flake8-print
-    # "TCH",  # flake8-type-checking
-    # "TD",  # flake8-todos
-    "TID", # flake8-tidy-imports
-    # "TRY",  # tryceratops
-    "UP",  # pyupgrade
-    "W",   # pycodestyle
-    "YTT", # flake8-2020
+  "A",  # flake8-builtins
+  # "ANN",  # flake8-annotations
+  "ARG",  # flake8-unused-arguments
+  # "ASYNC",  # flake8-async
+  "B",  # flake8-bugbear
+  # "BLE",  # flake8-blind-except
+  "C4",  # flake8-comprehensions
+  "C90",  # mccabe
+  # "COM",  # flake8-commas
+  # "D",  # pydocstyle
+  # "DJ",  # flake8-django
+  "DTZ",  # flake8-datetimez
+  "E",  # pycodestyle (default)
+  "EM",  # flake8-errmsg
+  # "ERA",  # eradicate
+  # "EXE",  # flake8-executable
+  "F",  # Pyflakes (default)
+  # "FA",  # flake8-future-annotations
+  "FBT",  # flake8-boolean-trap
+  # "FLY",  # flynt
+  # "G",  # flake8-logging-format
+  "I",  # isort
+  "ICN",  # flake8-import-conventions
+  # "INP",  # flake8-no-pep420
+  # "INT",  # flake8-gettext
+  "ISC",  # flake8-implicit-str-concat
+  "N",  # pep8-naming
+  # "NPY",  # NumPy-specific rules
+  # "PD",  # pandas-vet
+  # "PGH",  # pygrep-hooks
+  # "PIE",  # flake8-pie
+  # "PL",  # Pylint
+  "PLC",  # Pylint: Convention
+  "PLE",  # Pylint: Error
+  "PLR",  # Pylint: Refactor
+  "PLW",  # Pylint: Warning
+  # "PT",  # flake8-pytest-style
+  # "PTH",  # flake8-use-pathlib
+  # "PYI",  # flake8-pyi
+  "Q",  # flake8-quotes
+  # "RET",  # flake8-return
+  # "RSE",  # flake8-raise
+  "RUF",  # Ruff-specific rules
+  "S",  # flake8-bandit
+  # "SIM",  # flake8-simplify
+  # "SLF",  # flake8-self
+  "T10",  # flake8-debugger
+  "T20",  # flake8-print
+  # "TCH",  # flake8-type-checking
+  # "TD",  # flake8-todos
+  "TID",  # flake8-tidy-imports
+  # "TRY",  # tryceratops
+  "UP",  # pyupgrade
+  "W",  # pycodestyle
+  "YTT",  # flake8-2020
 ]
 ignore = [
-    # Allow non-abstract empty methods in abstract base classes
-    "B027",
-    "EM",
-    # Allow boolean positional values in function calls, like `dict.get(... True)`
-    "FBT003",
-    # Ignore checks for possible passwords
-    "S105",
-    "S106",
-    "S107",
-    # Ignore complexity
-    "C901",
-    "PLR0911",
-    "PLR0912",
-    "PLR0913",
-    "PLR0915",
+  # Allow non-abstract empty methods in abstract base classes
+  "B027",
+  # Remove flake8-errmsg since we consider they bloat the code and provide limited value
+  "EM",
+  # Allow boolean positional values in function calls, like `dict.get(... True)`
+  "FBT003",
+  # Ignore checks for possible passwords
+  "S105", "S106", "S107",
+  # Ignore warnings on subprocess.run / popen
+  "S603",
+  # Ignore complexity
+  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
 ]
 unfixable = [
-    # Don't touch unused imports
-    "F401",
+  # Don't touch unused imports
+  "F401",
 ]
 
 [tool.ruff.isort]
@@ -214,16 +212,12 @@ exclude_lines = [
 ]
 
 [tool.pyright]
-pythonVersion = "3.11"
-pythonPlatform = "All"
-typeCheckingMode = "basic"
-
 include = ["src", "tests", "tasks.py"]
 exclude = ["**/node_modules",
     "**/__pycache__",
     "src/kolibri2zim/templates",
 ]
-
-executionEnvironments= [
-    { root= "src" }
-]
+extraPaths = ["src"]
+pythonVersion = "3.11"
+pythonPlatform = "All"
+typeCheckingMode="basic"

From 6896f310d7f1fc9ae92f3562101196ab063bbd83 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 18:31:12 +0200
Subject: [PATCH 38/45] Update pyright to 1.1.318

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d09a6e2..d3a39d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dynamic = ["version"]
 [project.optional-dependencies]
 scripts = ["invoke==2.1.3"]
 lint = ["black==23.3.0", "ruff==0.0.272"]
-check = ["pyright==1.1.317"]
+check = ["pyright==1.1.318"]
 test = [
   "pytest==7.4.0",
   "coverage==7.2.7",

From aa794c258e7d783a696de04ee533d895cd2a868b Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 18:31:38 +0200
Subject: [PATCH 39/45] Fix a few quality issues

---
 dump_channel_to_fs.py         |  2 +-
 hatch_build.py                |  2 +-
 src/kolibri2zim/entrypoint.py |  2 +-
 tests/conftest.py             | 13 +++----------
 4 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py
index ad32cc0..4c67395 100755
--- a/dump_channel_to_fs.py
+++ b/dump_channel_to_fs.py
@@ -44,7 +44,7 @@ def download_if_missing(url, fpath, fsize=None, *, force=False):
     if not skipped:
         fpath.unlink(missing_ok=True)
         wget = subprocess.run(
-            [  # noqa: S603
+            [
                 "/usr/bin/env",
                 "wget",
                 "-t",
diff --git a/hatch_build.py b/hatch_build.py
index 1ecbbe0..314e24a 100644
--- a/hatch_build.py
+++ b/hatch_build.py
@@ -29,7 +29,7 @@ def initialize(self, version, build_data):
             return
         Path(self.root).joinpath("src/kolibri2zim/templates/assets")
         subprocess.run(
-            str(Path(self.root).joinpath("get_js_deps.sh")),  # noqa : S603
+            str(Path(self.root).joinpath("get_js_deps.sh")),  # : S603
             check=True,
         )
         return super().initialize(version, build_data)
diff --git a/src/kolibri2zim/entrypoint.py b/src/kolibri2zim/entrypoint.py
index 856bea2..34bc34b 100755
--- a/src/kolibri2zim/entrypoint.py
+++ b/src/kolibri2zim/entrypoint.py
@@ -206,7 +206,7 @@ def main():
         logger.error(f"FAILED. An error occurred: {exc}")
         if args.debug:
             logger.exception(exc)
-        raise SystemExit(1)
+        raise SystemExit(1) from exc
 
 
 if __name__ == "__main__":
diff --git a/tests/conftest.py b/tests/conftest.py
index 0210377..af47abf 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,12 +35,13 @@ def _scraper(
         channel_name: str = CHANNEL_NAME,
         channel_description: str = CHANNEL_DESCRIPTION,
         channel_author: str | None = None,
-        additional_options: dict[str, Any] = {},
+        additional_options: dict[str, Any] | None = None,
     ) -> Kolibri2Zim:
         options = {}
         for option_key in expected_options_keys:
             options[option_key] = None
-        options.update(additional_options)
+        if additional_options:
+            options.update(additional_options)
         scraper = Kolibri2Zim(**options)
         scraper.db = FakeDb(
             channel_author=channel_author,
@@ -50,11 +51,3 @@ def _scraper(
         return scraper
 
     yield _scraper
-
-
-# @pytest.fixture
-# def default_options() -> Generator[dict[str, Any], None, None]:
-#     default_options = {}
-#     for option in options:
-#         default_options[option] = None
-#     yield default_options

From e57b03d183498ed07eacb76647bf02938cc3fd87 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 24 Jul 2023 18:34:02 +0200
Subject: [PATCH 40/45] Make it clear why we need hatchling in dev environment

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index d3a39d4..860390e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ dev = [
     "kolibri2zim[lint]",
     "kolibri2zim[test]",
     "kolibri2zim[check]",
+    # hatchling is a dev dependency only needed for hook development on developer machine
     "hatchling==1.18.0",
 ]
 

From 319d9f9efba39a9aa880ea259a95c8c6fcc261ce Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 25 Jul 2023 08:03:22 +0200
Subject: [PATCH 41/45] Use build instead of hatch

---
 .github/workflows/publish.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 108c895..98c52fc 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,8 +21,8 @@ jobs:
 
       - name: Build packages
         run: |
-          pip install -U pip hatch
-          hatch build
+          pip install -U pip build
+          python -m build --sdist --wheel
 
       - name: Upload to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1.8
@@ -31,7 +31,6 @@ jobs:
         uses: openzim/docker-publish-action@v10
         with:
           image-name: openzim/kolibri
-          on-master: dev
           tag-pattern: /^v([0-9.]+)$/
           latest-on-tag: true
           restrict-to: openzim/kolibri

From 2673c688f8b653117030b3e1eef2998ade5473ef Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 25 Jul 2023 08:07:37 +0200
Subject: [PATCH 42/45] Small fixes / change revert following review

---
 hatch_build.py             |  5 +++--
 pyproject.toml             |  2 ++
 src/kolibri2zim/scraper.py | 44 ++++++++++++++++++--------------------
 3 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/hatch_build.py b/hatch_build.py
index 314e24a..86dfa52 100644
--- a/hatch_build.py
+++ b/hatch_build.py
@@ -29,7 +29,7 @@ def initialize(self, version, build_data):
             return
         Path(self.root).joinpath("src/kolibri2zim/templates/assets")
         subprocess.run(
-            str(Path(self.root).joinpath("get_js_deps.sh")),  # : S603
+            str(Path(self.root).joinpath("get_js_deps.sh")),
             check=True,
         )
         return super().initialize(version, build_data)
@@ -38,7 +38,8 @@ def deps_already_installed(self) -> bool:
         for dep in JS_DEPS:
             if (
                 not Path(self.root)
-                .joinpath(f"src/kolibri2zim/templates/assets/{dep}")
+                .joinpath("src/kolibri2zim/templates/assets")
+                .joinpath(dep)
                 .exists()
             ):
                 return False
diff --git a/pyproject.toml b/pyproject.toml
index 860390e..96879ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -162,6 +162,8 @@ select = [
 ignore = [
   # Allow non-abstract empty methods in abstract base classes
   "B027",
+  # Allow use of datetime with tz and date.today
+  "DTZ005", "DTZ011",
   # Remove flake8-errmsg since we consider they bloat the code and provide limited value
   "EM",
   # Allow boolean positional values in function calls, like `dict.get(... True)`
diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
index c72d7ea..11a28db 100644
--- a/src/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -32,7 +32,7 @@
 from zimscraperlib.zim.creator import Creator
 from zimscraperlib.zim.items import StaticItem
 
-from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, get_logger
+from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, Global, get_logger
 from kolibri2zim.database import KolibriDB
 from kolibri2zim.debug import (
     ON_DISK_THRESHOLD,
@@ -108,11 +108,12 @@ def go(option):
 
         # zim params
         self.fname = go("fname")
-        tags = go("tags")
-        if tags is None:
-            self.tags = []
-        else:
-            self.tags = [t.strip() for t in tags.split(",")]
+        self.tags = (
+            []
+            if go("tags") is None
+            else [t.strip() for t in go("tags").split(",")]  # pyright: ignore
+        )
+
         self.title = go("title")
         self.description = go("description")
         self.long_description = go("long_description")
@@ -126,17 +127,14 @@ def go(option):
         self.css = go("css")
 
         # directory setup
-        self.output_dir = Path(str(go("output_dir"))).expanduser().resolve()
-        tmp_dir = go("tmp_dir")
-        if tmp_dir:
-            Path(tmp_dir).mkdir(parents=True, exist_ok=True)
-        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))
+        self.output_dir = Path(go("output_dir") or "/output").expanduser().resolve()
+        if go("tmp_dir"):
+            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)  # pyright: ignore
+        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))
 
         # performances options
-        nb_threads_str = go("threads")
-        self.nb_threads = int(nb_threads_str) if nb_threads_str else None
-        nb_processes_str = go("processes")
-        self.nb_processes = int(nb_processes_str) if nb_processes_str else None
+        self.nb_threads = int(go("threads") or 1)
+        self.nb_processes = int(go("processes") or Global.nb_available_cpus)
         self.s3_url_with_credentials = go("s3_url_with_credentials")
         self.s3_storage = None
         self.dedup_html_files = go("dedup_html_files")
@@ -146,9 +144,10 @@ def go(option):
         self.keep_build_dir = go("keep_build_dir")
         self.debug = go("debug")
         self.only_topics = go("only_topics")
-        node_ids = go("node_ids")
         self.node_ids = (
-            None if node_ids is None else [t.strip() for t in node_ids.split(",")]
+            None
+            if go("node_ids") is None
+            else [t.strip() for t in go("node_ids").split(",")]  # pyright: ignore
         )
 
         # jinja2 environment setup
@@ -823,7 +822,7 @@ def run(self):
             LongDescription=self.long_description,
             Creator=self.author,
             Publisher=self.publisher,
-            Date=datetime.datetime.now(datetime.UTC),
+            Date=datetime.date.today(),
             Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(),
         )
         self.creator.start()
@@ -871,9 +870,8 @@ def run(self):
                     f"FAILURE not_done={len(result.not_done)} done={len(result.done)}"
                 )
                 for future in result.done:
-                    future_exception = future.exception()
-                    if future_exception:
-                        raise future_exception
+                    if future.exception():
+                        raise future.exception()  # pyright:ignore
         except KeyboardInterrupt:
             self.creator.can_finish = False
             logger.error("KeyboardInterrupt, exiting.")
@@ -929,12 +927,12 @@ def sanitize_inputs(self):
         channel_meta = self.db.get_channel_metadata(self.channel_id)
 
         # input  & metadata sanitation
-        period = datetime.datetime.now(datetime.UTC).strftime("%Y-%m")
+        period = datetime.datetime.now().strftime("%Y-%m")
         if self.fname:
             # make sure we were given a filename and not a path
             fname_path = Path(str(self.fname).format(period=period))
             if Path(fname_path.name) != fname_path:
-                raise ValueError(f"filename is not a filename: {fname_path}")
+                raise ValueError(f"filename is not a filename: {self.fname}")
             self.clean_fname = str(fname_path)
         else:
             self.clean_fname = f"{self.name}_{period}.zim"

From dfaf970ad1f2911c162280103685e6b39975855e Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 25 Jul 2023 08:08:28 +0200
Subject: [PATCH 43/45] Simplify code + add support for setting only the long
 description

---
 src/kolibri2zim/scraper.py    | 47 ++++++++++++++---------------------
 tests/test_sanitize_inputs.py | 12 ++++-----
 2 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
index 11a28db..412bcf4 100644
--- a/src/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -941,36 +941,25 @@ def sanitize_inputs(self):
             self.title = channel_meta["name"]
         self.title = self.title.strip()
 
+        if self.description and len(self.description) > MAX_DESC_LENGTH:
+            raise ValueError(
+                f"Description too long ({len(self.description)}>{MAX_DESC_LENGTH})"
+            )
+        if self.long_description and len(self.long_description) > MAX_LONG_DESC_LENGTH:
+            raise ValueError(
+                f"LongDescription too long ({len(self.long_description)}"
+                f">{MAX_LONG_DESC_LENGTH})"
+            )
+
+        kolibri_desc = channel_meta["description"].strip()
+        if not self.long_description and len(kolibri_desc) > MAX_DESC_LENGTH:
+            self.long_description = kolibri_desc[0:MAX_LONG_DESC_LENGTH]
+            if len(kolibri_desc) > MAX_LONG_DESC_LENGTH:
+                self.long_description = self.long_description[:-1] + "…"
         if not self.description:
-            # User did not provided a description, we will infer it from channel
-            # metadata, limited to maximum length
-            if self.long_description:
-                raise ValueError(
-                    "long_description cannot be set if description is not set"
-                )
-            self.description = channel_meta["description"].strip()
-            if len(self.description) > MAX_DESC_LENGTH:
-                self.long_description = self.description
-                self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…"
-                if len(self.long_description) > MAX_LONG_DESC_LENGTH:
-                    self.long_description = (
-                        f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…"
-                    )
-        else:
-            self.description = self.description.strip()
-            if len(self.description) > MAX_DESC_LENGTH:
-                raise ValueError(
-                    f"description is too long ({len(self.description)}"
-                    f">{MAX_DESC_LENGTH})"
-                )
-            if (
-                self.long_description
-                and len(self.long_description) > MAX_LONG_DESC_LENGTH
-            ):
-                raise ValueError(
-                    f"long_description is too long ({len(self.long_description)}"
-                    f">{MAX_LONG_DESC_LENGTH})"
-                )
+            self.description = kolibri_desc[0:MAX_DESC_LENGTH]
+            if len(kolibri_desc) > MAX_DESC_LENGTH:
+                self.description = self.description[:-1] + "…"
 
         if not self.author:
             self.author = channel_meta["author"] or "Kolibri"
diff --git a/tests/test_sanitize_inputs.py b/tests/test_sanitize_inputs.py
index 1504c12..0569c09 100644
--- a/tests/test_sanitize_inputs.py
+++ b/tests/test_sanitize_inputs.py
@@ -92,7 +92,7 @@ def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]):
             LONG_TEXT[0:MAX_DESC_LEN],
             None,
         ),
-        # CLI description set and is too long, channel description doe not matter
+        # CLI description set and is too long, channel description does not matter
         (LONG_TEXT[0 : MAX_DESC_LEN + 1], None, TEXT_NOT_USED, True, None, None),
         # CLI description not set and channel description is short enough
         (None, None, LONG_TEXT[0:MAX_DESC_LEN], False, LONG_TEXT[0:MAX_DESC_LEN], None),
@@ -145,14 +145,14 @@ def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]):
             None,
         ),
         # CLI description not set, CLI long descripion set and is short,
-        # channel description does not matter
+        # channel description set to something different than long desc
         (
             None,
             LONG_TEXT[0:MAX_LONG_DESC_LEN],
-            TEXT_NOT_USED,
-            True,
-            None,
-            None,
+            LONG_TEXT[10:MAX_LONG_DESC_LEN],
+            False,
+            LONG_TEXT[10 : MAX_DESC_LEN + 9] + "…",
+            LONG_TEXT[0:MAX_LONG_DESC_LEN],
         ),
     ],
 )

From b005ed25ca3e44474d6ce49c79d4d35989daea47 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 25 Jul 2023 10:50:50 +0200
Subject: [PATCH 44/45] Fix CHANGELOG to add latest changes

---
 CHANGELOG.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index adbabde..a5753f3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- Add `--long-description` CLI parameter to set ZIM long description
+- Add `--node-ids` CLI parameter to process only a few channel nodes (_useful for debugging mostly_)
+
 ### Fixed
+- Fixed issue with ZIM description too long when sourced from channel metadata
+- Fixed issue with ZIM icon sizes / formats
 - Fix issue with ePub rendering which was outside the iframe
 - Description is now limited to expected lenght and long description is set
 - Icons and illustrations are squared as expected
@@ -17,11 +23,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Migrate to our new Python standard (hatch, ruff, pyright, ...)
 - Using zimscraperlib 3.1.1
-- Updated image to `python:3.11-bullseye`
+- Updated image to `python:3.11-bookworm`
 - Retry video reencoding up to three times
 - Move inline javascript to dedicated files
 - Move huge inline CSS to dedicated file
-- Add `--node-ids` CLI parameter to process only few nodes (useful for debugging)
 
 ## [1.0.1] - 2023-02-22
 
@@ -34,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [1.0.0] - 2021-11-11
 
+### Added
 - initial version
 - supports topic/document/audio/video/html5/exercise content types
 - uses libzim7

From 9e4a610e2bdab2ff15d222f8c4b4b802fb38ef12 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:15:59 +0200
Subject: [PATCH 45/45] Use today() as it is already used somewhere else and
 sufficient

---
 pyproject.toml             | 4 ++--
 src/kolibri2zim/scraper.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 96879ec..659a2d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -162,8 +162,8 @@ select = [
 ignore = [
   # Allow non-abstract empty methods in abstract base classes
   "B027",
-  # Allow use of datetime with tz and date.today
-  "DTZ005", "DTZ011",
+  # Allow use of date.today
+  "DTZ011",
   # Remove flake8-errmsg since we consider they bloat the code and provide limited value
   "EM",
   # Allow boolean positional values in function calls, like `dict.get(... True)`
diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py
index 412bcf4..e812305 100644
--- a/src/kolibri2zim/scraper.py
+++ b/src/kolibri2zim/scraper.py
@@ -927,7 +927,7 @@ def sanitize_inputs(self):
         channel_meta = self.db.get_channel_metadata(self.channel_id)
 
         # input  & metadata sanitation
-        period = datetime.datetime.now().strftime("%Y-%m")
+        period = datetime.date.today().strftime("%Y-%m")
         if self.fname:
             # make sure we were given a filename and not a path
             fname_path = Path(str(self.fname).format(period=period))