From e91b5d16e54ed569035a337e1773729357399bc3 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 08:58:02 +0200 Subject: [PATCH 01/45] Use standard .gitignore from toptal --- .gitignore | 249 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 245 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index fb4df5f..d4c96ac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,178 @@ +# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,database,visualstudiocode,intellij +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,database,visualstudiocode,intellij +### Database ### +*.accdb +*.db +*.dbf +*.mdb +*.pdb +*.sqlite3 +*.db-shm +*.db-wal + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file 
is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -21,7 +195,6 @@ parts/ sdist/ var/ wheels/ -pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -51,6 +224,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +cover/ # Translations *.mo @@ -73,6 +247,7 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook @@ -83,7 +258,9 @@ profile_default/ ipython_config.py # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -92,7 +269,22 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# PEP 582; used by e.g. github.com/David-OConnor/pyflow +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff @@ -129,7 +321,50 @@ dmypy.json # Pyre type checker .pyre/ -.DS_Store +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,database,visualstudiocode,intellij + # assets that we download kolibri2zim/templates/assets/bootstrap/ @@ -143,3 +378,9 @@ kolibri2zim/templates/assets/epub.min.js kolibri2zim/templates/assets/bootstrap-icons/ kolibri2zim/templates/assets/jszip.min.js kolibri2zim/templates/assets/perseus/ + +# output dir +output + +# ignore all vscode, this is not standard configuration in this place +.vscode From 6afd9a98071af00f75dca210124e70c6583a3a08 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 08:58:47 +0200 Subject: 
[PATCH 02/45] Fail get_js_deps script on download / unzip errors --- get_js_deps.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/get_js_deps.sh b/get_js_deps.sh index ff10f77..9936269 100755 --- a/get_js_deps.sh +++ b/get_js_deps.sh @@ -1,5 +1,7 @@ #!/bin/sh +set -e + ### # download JS dependencies and place them in our templates/assets folder # then launch our ogv.js script to fix dynamic loading links From e043236f324c8ec90273e5a5b19083d56247e087 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 08:59:44 +0200 Subject: [PATCH 03/45] Migration version info to __about__.py to match standard --- kolibri2zim/VERSION | 1 - kolibri2zim/__about__.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 kolibri2zim/VERSION create mode 100644 kolibri2zim/__about__.py diff --git a/kolibri2zim/VERSION b/kolibri2zim/VERSION deleted file mode 100644 index 7dea76e..0000000 --- a/kolibri2zim/VERSION +++ /dev/null @@ -1 +0,0 @@ -1.0.1 diff --git a/kolibri2zim/__about__.py b/kolibri2zim/__about__.py new file mode 100644 index 0000000..f4c3a84 --- /dev/null +++ b/kolibri2zim/__about__.py @@ -0,0 +1 @@ +__version__ = "1.1.0-dev0" From 6caf0d467171f7622d23a02c2680bc9235ce766e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 09:00:30 +0200 Subject: [PATCH 04/45] Migrate to pyproject.toml + hatch instead of setuptools --- hatch_build.py | 16 +++++ pyproject.toml | 183 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 53 -------------- tasks.py | 76 ++++++++++++++++++++ 4 files changed, 275 insertions(+), 53 deletions(-) create mode 100644 hatch_build.py create mode 100644 pyproject.toml delete mode 100644 setup.py create mode 100644 tasks.py diff --git a/hatch_build.py b/hatch_build.py new file mode 100644 index 0000000..66b1f69 --- /dev/null +++ b/hatch_build.py @@ -0,0 +1,16 @@ +import logging +import subprocess +from pathlib import Path + +from hatchling.builders.hooks.plugin.interface import BuildHookInterface + 
+logger = logging.getLogger(__name__) + + +class GetJsDepsHook(BuildHookInterface): + def initialize(self, version, build_data): + subprocess.run( + Path(self.root).joinpath("get_js_deps.sh").as_posix(), # noqa : S603 + check=True, + ) + return super().initialize(version, build_data) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2f44f8a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,183 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "kolibri2zim" +authors = [{ name = "Kiwix", email = "dev@kiwix.org" }] +keywords = ["kiwix", "zim", "offline", "kolibri"] +requires-python = ">=3.11" +description = "Make ZIM file from Kolibri Channels" +readme = "README.md" +license = { text = "GPL-3.0-or-later" } +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", +] +dependencies = [ + "zimscraperlib==3.1.0", + "kiwixstorage==0.8.3", + "Jinja2==3.1.2", + "pif==0.8.2", + "beautifulsoup4==4.9.3", + "retrying==1.3.4", +] +dynamic = ["version"] + +[project.optional-dependencies] +scripts = ["invoke==2.1.3"] +lint = ["black==23.3.0", "ruff==0.0.272"] +check = ["pyright==1.1.317"] +dev = [ + "debugpy", + "kolibri2zim[scripts]", + "kolibri2zim[lint]", + "kolibri2zim[check]", +] + +[project.urls] +Homepage = "https://github.com/openzim/kolibri" +Donate = "https://www.kiwix.org/en/support-us/" + +[project.scripts] +kolibri2zim = "kolibri2zim:entrypoint.main" + +[tool.hatch.version] +path = "kolibri2zim/__about__.py" + +[tool.hatch.build] +exclude = ["/.github"] + +[tool.hatch.build.hooks.custom] + +[tool.hatch.envs.default] +features = ["dev"] + +[tool.hatch.envs.lint] +template = "lint" +python = "py311" +skip-install = false +features = ["scripts", "lint"] + +[tool.hatch.envs.lint.scripts] +black = "inv lint-black --args '{args}'" +ruff = "inv lint-ruff --args '{args}'" +all = 
"inv lintall --args '{args}'" +fix-black = "inv fix-black --args '{args}'" +fix-ruff = "inv fix-ruff --args '{args}'" +fixall = "inv fixall --args '{args}'" + +[tool.hatch.envs.check] +features = ["scripts", "check"] + +[tool.hatch.envs.check.scripts] +pyright = "inv check-pyright --args '{args}'" +all = "inv checkall --args '{args}'" + +[tool.black] +line-length = 88 +target-version = ['py311'] + +[tool.ruff] +target-version = "py311" +line-length = 88 +src = ["kolibri2zim"] +select = [ + # "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + # "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # flake8-no-pep420 + # "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] 
+ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["kolibri2zim"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.pyright] +pythonVersion = "3.11" +pythonPlatform = "All" +typeCheckingMode = "basic" + +include = ["kolibri2zim"] +exclude = ["**/node_modules", + "**/__pycache__", + "kolibri2zim/templates", +] diff --git a/setup.py b/setup.py deleted file mode 100644 index c6b1eab..0000000 --- a/setup.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - -import pathlib -import subprocess -from setuptools import setup - -root_dir = pathlib.Path(__file__).parent - - -def read(*names, **kwargs): - with open(root_dir.joinpath(*names), "r") as fh: - return fh.read() - - -print("Downloading and fixing JS dependencies...") -subprocess.run([str(root_dir.joinpath("get_js_deps.sh").resolve())], check=True) - - -setup( - name="kolibri2zim", - version=read("kolibri2zim", "VERSION").strip(), - description="Make ZIM file from Kolibri Channels", - long_description=read("README.md"), - long_description_content_type="text/markdown", - author="satyamtg", - author_email="io.satyamtg@gmail.com", - url="https://github.com/openzim/kolibri2zim", - keywords="kiwix zim offline kolibri", - license="GPLv3+", - packages=["kolibri2zim"], - install_requires=[ - line.strip() - for line in read("requirements.txt").splitlines() - if not line.strip().startswith("#") - ], - zip_safe=False, - 
include_package_data=True, - entry_points={ - "console_scripts": [ - "kolibri2zim=kolibri2zim.__main__:main", - ] - }, - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - ], - python_requires=">=3.6", -) diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..424223b --- /dev/null +++ b/tasks.py @@ -0,0 +1,76 @@ +# pyright: strict, reportUntypedFunctionDecorator=false +import os + +from invoke.context import Context +from invoke.tasks import task # pyright: ignore [reportUnknownVariableType] + +use_pty = not os.getenv("CI", "") + + +@task( + optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"} +) +def lint_black(ctx: Context, args: str | None = ""): + args = args or "." + ctx.run("black --version", pty=use_pty) + ctx.run(f"black --check --diff {args}", pty=use_pty) + + +@task( + optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"} +) +def lint_ruff(ctx: Context, args: str | None = ""): + args = args or "." + ctx.run("ruff --version", pty=use_pty) + ctx.run(f"ruff check {args}", pty=use_pty) + + +@task( + optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"} +) +def lintall(ctx: Context, args: str | None = ""): + """check linting""" + args = args or "." 
+ lint_black(ctx, args) + lint_ruff(ctx, args) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def check_pyright(ctx: Context, args: str | None = ""): + """check static types with pyright""" + args = args or "" + ctx.run("pyright --version") + ctx.run(f"pyright {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def checkall(ctx: Context, args: str | None = ""): + """check static types""" + args = args or "" + check_pyright(ctx, args) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def fix_black(ctx: Context, args: str | None = ""): + """fix black formatting""" + args = args or "." + ctx.run(f"black {args}", pty=use_pty) # type: ignore + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def fix_ruff(ctx: Context, args: str | None = ""): + """fix all ruff rules""" + args = args or "." + ctx.run(f"ruff --fix {args}", pty=use_pty) # type: ignore + + +@task( + optional=["args"], + help={"args": "linting (fix mode) tools (black, ruff) additional arguments"}, +) +def fixall(ctx: Context, args: str | None = ""): + """fix everything automatically""" + args = args or "." 
+ fix_black(ctx, args) + fix_ruff(ctx, args) + lintall(ctx, args) From 222727d1b67976dada9fb67beee626cec87e3eca Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 09:00:44 +0200 Subject: [PATCH 05/45] Add pre-commit configuration --- .pre-commit-config.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..577ac69 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/psf/black + rev: "23.3.0" + hooks: + - id: black +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.272 + hooks: + - id: ruff +- repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.315 + hooks: + - id: pyright + name: pyright (system) + description: 'pyright static type checker' + entry: pyright + language: system + 'types_or': [python, pyi] + require_serial: true + minimum_pre_commit_version: '2.9.2' From a2fabb1dc132e44a45494db2604869f410c1dd34 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 09:07:51 +0200 Subject: [PATCH 06/45] Fix simple issues raised by ruff --- dump_channel_to_fs.py | 9 +++---- kolibri2zim/__main__.py | 3 +-- kolibri2zim/constants.py | 5 ++-- kolibri2zim/database.py | 3 +-- kolibri2zim/debug.py | 7 +++--- kolibri2zim/entrypoint.py | 7 +++--- kolibri2zim/processing.py | 4 +-- kolibri2zim/scraper.py | 51 ++++++++++++++++++++++----------------- 8 files changed, 45 insertions(+), 44 deletions(-) diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py index ad066f7..298a9f0 100755 --- a/dump_channel_to_fs.py +++ b/dump_channel_to_fs.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 
-*- # vim: ai ts=4 sts=4 et sw=4 nu @@ -18,14 +17,14 @@ Uses wget for downloads """ +import contextlib +import logging +import multiprocessing as mp import os -import sys import pathlib -import logging import sqlite3 -import contextlib import subprocess -import multiprocessing as mp +import sys STUDIO_DEFAULT_BASE_URL = "https://studio.learningequality.org" STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL) diff --git a/kolibri2zim/__main__.py b/kolibri2zim/__main__.py index 03b42a7..5615eb7 100644 --- a/kolibri2zim/__main__.py +++ b/kolibri2zim/__main__.py @@ -1,9 +1,8 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import sys import pathlib +import sys def main(): diff --git a/kolibri2zim/constants.py b/kolibri2zim/constants.py index 578c8fc..3495c2a 100644 --- a/kolibri2zim/constants.py +++ b/kolibri2zim/constants.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import os -import pathlib import logging import multiprocessing +import os +import pathlib from zimscraperlib.logging import getLogger as lib_getLogger diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py index 3ce6b3c..22f2d70 100644 --- a/kolibri2zim/database.py +++ b/kolibri2zim/database.py @@ -1,9 +1,8 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import pathlib import logging +import pathlib import sqlite3 logger = logging.getLogger(__name__) diff --git a/kolibri2zim/debug.py b/kolibri2zim/debug.py index b6fe2cd..66832e5 100644 --- a/kolibri2zim/debug.py +++ b/kolibri2zim/debug.py @@ -1,11 +1,10 @@ import io import logging import pathlib -from typing import Optional, Tuple import requests from retrying import retry -from zimscraperlib.download import stream_file, _get_retry_adapter +from zimscraperlib.download import _get_retry_adapter, stream_file from zimscraperlib.video.encoding import reencode logging.basicConfig(level=logging.DEBUG) @@ -41,8 +40,8 @@ 
def get_size_and_mime(url: str) -> Tuple[int, str]: @retry(stop_max_attempt_number=5, wait_exponential_multiplier=20000) def download_to( url: str, - fpath: Optional[pathlib.Path] = None, - byte_stream: Optional[io.IOBase] = None, + fpath: pathlib.Path | None = None, + byte_stream: io.BytesIO | None = None, ): logger.debug(f"download_to({url=}) {'to-file' if fpath else 'to-mem'}") stream_file(url, fpath=fpath, byte_stream=byte_stream) diff --git a/kolibri2zim/entrypoint.py b/kolibri2zim/entrypoint.py index 93f9f28..fc2f0f3 100755 --- a/kolibri2zim/entrypoint.py +++ b/kolibri2zim/entrypoint.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import sys import argparse +import sys -from .constants import NAME, SCRAPER, Global, getLogger, setDebug +from kolibri2zim.constants import NAME, SCRAPER, Global, get_logger, set_debug def main(): @@ -193,7 +192,7 @@ def main(): setDebug(args.debug) logger = getLogger() - from .scraper import Kolibri2Zim + from kolibri2zim.scraper import Kolibri2Zim try: scraper = Kolibri2Zim(**dict(args._get_kwargs())) diff --git a/kolibri2zim/processing.py b/kolibri2zim/processing.py index 6de0f4f..635f9aa 100644 --- a/kolibri2zim/processing.py +++ b/kolibri2zim/processing.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu from zimscraperlib.video.encoding import reencode @@ -27,7 +26,8 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality): raise FileNotFoundError(f"Missing video file in {video_dir}") if len(files) > 1: logger.warning( - f"Multiple video file candidates for {video_id} in {video_dir}. Picking {files[0]} out of {files}" + f"Multiple video file candidates for {video_id} in {video_dir}. 
" + f"Picking {files[0]} out of {files}" ) src_path = files[0] diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 28d59c1..5fa208c 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -1,36 +1,40 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import io -import shutil import base64 -import zipfile +import concurrent.futures as cf import datetime -import tempfile -import threading import hashlib +import io import json +import shutil +import tempfile +import threading +import zipfile from pathlib import Path -from typing import Optional -import concurrent.futures as cf +from typing import Any import jinja2 from bs4 import BeautifulSoup -from pif import get_public_ip from kiwixstorage import KiwixStorage -from zimscraperlib.zim.creator import Creator -from zimscraperlib.zim.items import StaticItem +from pif import get_public_ip +from zimscraperlib.filesystem import get_file_mimetype from zimscraperlib.i18n import find_language_names -from zimscraperlib.inputs import handle_user_provided_file from zimscraperlib.image.convertion import convert_image, create_favicon from zimscraperlib.image.transformation import resize_image -from zimscraperlib.filesystem import get_file_mimetype -from zimscraperlib.video.presets import VideoWebmLow, VideoWebmHigh, VideoMp4Low +from zimscraperlib.inputs import handle_user_provided_file +from zimscraperlib.video.presets import VideoMp4Low, VideoWebmHigh, VideoWebmLow +from zimscraperlib.zim.creator import Creator +from zimscraperlib.zim.items import StaticItem -from .constants import ROOT_DIR, getLogger, STUDIO_URL -from .database import KolibriDB -from .debug import ON_DISK_THRESHOLD, download_to, get_size_and_mime, safer_reencode +from kolibri2zim.constants import ROOT_DIR, STUDIO_URL, get_logger +from kolibri2zim.database import KolibriDB +from kolibri2zim.debug import ( + ON_DISK_THRESHOLD, + download_to, + get_size_and_mime, + safer_reencode, +) logger = getLogger() 
options = [ @@ -58,7 +62,7 @@ "about", "css", "dedup_html_files", - "node_ids" + "node_ids", ] NOSTREAM_FUNNEL_SIZE = 1024 # 2**20 * 2 # 2MiB @@ -198,7 +202,12 @@ def funnel_file(self, fid, fext): url, fname = get_kolibri_url_for(fid, fext) size, mimetype = get_size_and_mime(url) - item_kw = dict(path=fname, title="", mimetype=mimetype, delete_fpath=True) + item_kw = { + "path": fname, + "title": "", + "mimetype": mimetype, + "delete_fpath": True, + } if not size or size >= ON_DISK_THRESHOLD: item_kw["fpath"] = Path( @@ -347,7 +356,6 @@ def add_video_node(self, node_id): # funnel from S3 cache if it is present there if not self.funnel_from_s3(vfid, path, vchk, preset): - # download original video src = self.download_to_disk(vid, video_file["ext"]) dst = src.with_suffix(".webm") @@ -365,7 +373,6 @@ def add_video_node(self, node_id): # funnel from S3 cache if it is present there if not self.funnel_from_s3(vfid, path, vchk, preset): - # download original video src = self.download_to_disk(vid, video_file["ext"]) @@ -905,7 +912,7 @@ def sanitize_inputs(self): self.publisher = "Openzim" self.publisher = self.publisher.strip() - self.tags = list(set(self.tags + ["_category:other", "kolibri", "_videos:yes"])) + self.tags = list({*self.tags, "_category:other", "kolibri", "_videos:yes"}) def retrieve_favicon(self): favicon_orig = self.build_dir / "favicon" From bc5785d03e79bb2633b774dc94eb43a6ced441b4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 09:10:28 +0200 Subject: [PATCH 07/45] Fix subtle issues raised by pyright / ruff --- dump_channel_to_fs.py | 39 +++++-- kolibri2zim/__main__.py | 2 +- kolibri2zim/constants.py | 9 +- kolibri2zim/database.py | 44 ++++--- kolibri2zim/debug.py | 6 +- kolibri2zim/entrypoint.py | 4 +- kolibri2zim/processing.py | 8 +- kolibri2zim/scraper.py | 233 ++++++++++++++++++++++++-------------- 8 files changed, 216 insertions(+), 129 deletions(-) diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py index 298a9f0..87ab761 
100755 --- a/dump_channel_to_fs.py +++ b/dump_channel_to_fs.py @@ -35,7 +35,7 @@ logger = logging.getLogger("dump-remote") -def download_if_missing(url, fpath, fsize=None, force=False): +def download_if_missing(url, fpath, fsize=None, *, force=False): skipped = ( fpath.exists() and (fsize is not None and os.path.getsize(fpath) == fsize) @@ -44,7 +44,7 @@ def download_if_missing(url, fpath, fsize=None, force=False): if not skipped: fpath.unlink(missing_ok=True) wget = subprocess.run( - [ + [ # noqa: S603 "/usr/bin/env", "wget", "-t", @@ -62,7 +62,8 @@ def download_if_missing(url, fpath, fsize=None, force=False): ) if wget.returncode != 0: logger.error(wget.stdout) - raise Exception(f"wget exited with retcode {wget.returncode}") + msg = f"wget exited with retcode {wget.returncode}" + raise Exception(msg) return not skipped, url, fpath @@ -89,12 +90,11 @@ def get_rows(db_path, query): cursor = conn.execute(query) rows = cursor.fetchmany() while rows: - for row in rows: - yield row + yield from rows rows = cursor.fetchmany() -def dump(channel_id, build_dir=None, force=False): +def dump(channel_id: str, build_dir: str | None, *, force: bool): build_path = pathlib.Path(build_dir or "build") logger.info(f"dumping {channel_id} into {build_path}") build_path.mkdir(exist_ok=True, parents=True) @@ -112,7 +112,7 @@ def dump(channel_id, build_dir=None, force=False): nb_files = get_single_value(db_path, "SELECT COUNT(*) FROM content_file") logger.info(f"Looping over all {nb_files} files") - def on_error(*args, **kwargs): + def on_error(*args, **kwargs): # noqa: ARG001 logger.error("Failed to download something") def on_success(result): @@ -145,8 +145,27 @@ def on_success(result): logger.info("Done downloading files") +CHANNEL_ID_POS_IN_ARGV = 2 +BUILD_DIR_POS_IN_ARGV = 3 +FORCE_POS_IN_ARGV = 4 + if __name__ == "__main__": - if len(sys.argv) < 2: - print("Missing channel ID") + if len(sys.argv) < CHANNEL_ID_POS_IN_ARGV: + logger.error("Missing channel ID") sys.exit(1) - 
dump(*sys.argv[1:]) + + channel_id = sys.argv[1] + if len(sys.argv) >= BUILD_DIR_POS_IN_ARGV: + build_dir = sys.argv[2] + else: + build_dir = None + if len(sys.argv) >= FORCE_POS_IN_ARGV: + force = ( + sys.argv[3].lower() == "true" + or sys.argv[3].lower() == "force" + or sys.argv[3].lower() == "yes" + ) + else: + force = False + + dump(channel_id=channel_id, build_dir=build_dir, force=force) diff --git a/kolibri2zim/__main__.py b/kolibri2zim/__main__.py index 5615eb7..c595cdc 100644 --- a/kolibri2zim/__main__.py +++ b/kolibri2zim/__main__.py @@ -7,7 +7,7 @@ def main(): # allows running it from source using python kolibri2zim - sys.path = [str(pathlib.Path(__file__).parent.parent.resolve())] + sys.path + sys.path = [str(pathlib.Path(__file__).parent.parent.resolve()), *sys.path] from kolibri2zim.entrypoint import main as entry diff --git a/kolibri2zim/constants.py b/kolibri2zim/constants.py index 3495c2a..6d7a7bb 100644 --- a/kolibri2zim/constants.py +++ b/kolibri2zim/constants.py @@ -11,7 +11,7 @@ ROOT_DIR = pathlib.Path(__file__).parent NAME = ROOT_DIR.name -with open(ROOT_DIR.joinpath("VERSION"), "r") as fh: +with open(ROOT_DIR.joinpath("VERSION")) as fh: VERSION = fh.read().strip() SCRAPER = f"{NAME} {VERSION}" @@ -25,7 +25,7 @@ def is_running_inside_container(): if not fpath.exists(): return False try: - with open(fpath, "r") as fh: + with open(fpath) as fh: for line in fh.readlines(): if line.strip().rsplit(":", 1)[-1] != "/": return True @@ -37,6 +37,7 @@ def is_running_inside_container(): class Global: debug = False inside_container = is_running_inside_container() + nb_available_cpus: int Global.nb_available_cpus = ( @@ -44,11 +45,11 @@ class Global: ) -def setDebug(debug): +def set_debug(debug): """toggle constants global DEBUG flag (used by getLogger)""" Global.debug = bool(debug) -def getLogger(): +def get_logger(): """configured logger respecting DEBUG flag""" return lib_getLogger(NAME, level=logging.DEBUG if Global.debug else logging.INFO) diff --git 
a/kolibri2zim/database.py b/kolibri2zim/database.py index 22f2d70..5c0f0d2 100644 --- a/kolibri2zim/database.py +++ b/kolibri2zim/database.py @@ -22,7 +22,7 @@ class KolibriDB: Kolibri uses the Modified Preorder Tree Traversal model, from django-mptt https://gist.github.com/tmilos/f2f999b5839e2d42d751""" - def __init__(self, fpath: pathlib.Path, root_id: str = None): + def __init__(self, fpath: pathlib.Path, root_id: str | None = None): self.conn = sqlite3.connect( f"file:{fpath.expanduser().resolve()}?mode=ro", uri=True, @@ -36,7 +36,8 @@ def __init__(self, fpath: pathlib.Path, root_id: str = None): self.root = self.get_node(root_id) if not self.root: - raise ValueError(f"No node for root-id {root_id}") + msg = f"No node for root-id {root_id}" + raise ValueError(msg) @property def fpath(self): @@ -73,8 +74,7 @@ def get_rows(self, query, *args, **kwargs): cursor = conn.execute(query, *args, **kwargs) rows = cursor.fetchmany() while rows: - for row in rows: - yield row + yield from rows rows = cursor.fetchmany() def get_channel_metadata(self, channel_id): @@ -94,8 +94,7 @@ def get_node_descendants(self, node_id, left=None, right=None): "ORDER BY level ASC", (left, right), ): - row = dict(row) - yield row + yield dict(row) def get_node_children(self, node_id, left=None, right=None): if left is None or right is None: @@ -110,17 +109,17 @@ def get_node_children(self, node_id, left=None, right=None): "ORDER BY level ASC", (left, right, node_id), ): - row = dict(row) - row.update( + rowdict = dict(row) + rowdict.update( { - "thumbnail": self.get_thumbnail_name(row["id"]), + "thumbnail": self.get_thumbnail_name(rowdict["id"]), } ) - yield row + yield rowdict def get_node_children_count(self, node_id, left=None, right=None): if left is None or right is None: - node = self.get_node(with_parents=False, with_children=False) + node = self.get_node(node_id, with_parents=False, with_children=False) left = node["left"] right = node["right"] @@ -132,7 +131,7 @@ def 
get_node_children_count(self, node_id, left=None, right=None): def get_node_parents(self, node_id, left=None, right=None): if left is None or right is None: - node = self.get_node(with_parents=False, with_children=False) + node = self.get_node(node_id, with_parents=False, with_children=False) left = node["left"] right = node["right"] @@ -147,7 +146,7 @@ def get_node_parents(self, node_id, left=None, right=None): def get_node_parents_count(self, node_id, left=None, right=None): if left is None or right is None: - node = self.get_node(with_parents=False, with_children=False) + node = self.get_node(node_id, with_parents=False, with_children=False) left = node["left"] right = node["right"] @@ -159,7 +158,7 @@ def get_node_parents_count(self, node_id, left=None, right=None): (left, right, self.root_left, self.root_right), ) - def get_node(self, node_id, with_parents=False, with_children=False): + def get_node(self, node_id, *, with_parents=False, with_children=False): node = self.get_row( "SELECT id, title, description, author, level, kind, " "license_name as license, license_owner, " @@ -195,13 +194,13 @@ def get_node(self, node_id, with_parents=False, with_children=False): ) return node - def get_node_file(self, node_id, thumbnail=False): + def get_node_file(self, node_id, *, thumbnail=False): try: - return next(self.get_node_files(node_id, thumbnail)) + return next(self.get_node_files(node_id, thumbnail=thumbnail)) except StopIteration: return None - def get_node_files(self, node_id, thumbnail=False): + def get_node_files(self, node_id, *, thumbnail=False): for row in self.get_rows( "SELECT id as fid, local_file_id as id, " "extension as ext, priority as prio, " @@ -210,7 +209,16 @@ def get_node_files(self, node_id, thumbnail=False): "ORDER BY priority ASC", (node_id, 1, 1 if thumbnail else 0), ): - yield dict(row) + yield { + "id": row["id"], + "fid": row["fid"], + "ext": row["ext"], + "prio": row["prio"], + "supp": row["supp"], + "checksum": row["checksum"], + 
"lang": row["lang"], + "preset": row["preset"], + } def get_node_thumbnail(self, node_id): return self.get_node_file(node_id, thumbnail=True) diff --git a/kolibri2zim/debug.py b/kolibri2zim/debug.py index 66832e5..0077113 100644 --- a/kolibri2zim/debug.py +++ b/kolibri2zim/debug.py @@ -18,9 +18,11 @@ # retry up to 3 times, with delay from 40s @retry(stop_max_attempt_number=3, wait_exponential_multiplier=20000) -def get_size_and_mime(url: str) -> Tuple[int, str]: +def get_size_and_mime(url: str) -> tuple[int | None, str]: logger.debug(f"get_size_and_mime({url=})") - _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True) + _, headers = stream_file( + url, byte_stream=io.BytesIO(), only_first_block=True + ) # type: ignore # see https://github.com/openzim/python-scraperlib/issues/104 mimetype = headers.get("Content-Type", "application/octet-stream") # Encoded data (compressed) prevents us from using Content-Length header # as source for the content (it represents length of compressed data) diff --git a/kolibri2zim/entrypoint.py b/kolibri2zim/entrypoint.py index fc2f0f3..98576e8 100755 --- a/kolibri2zim/entrypoint.py +++ b/kolibri2zim/entrypoint.py @@ -189,8 +189,8 @@ def main(): ) args = parser.parse_args() - setDebug(args.debug) - logger = getLogger() + set_debug(args.debug) + logger = get_logger() from kolibri2zim.scraper import Kolibri2Zim diff --git a/kolibri2zim/processing.py b/kolibri2zim/processing.py index 635f9aa..49a3088 100644 --- a/kolibri2zim/processing.py +++ b/kolibri2zim/processing.py @@ -3,10 +3,9 @@ from zimscraperlib.video.encoding import reencode -from .constants import getLogger +from kolibri2zim.constants import get_logger - -logger = getLogger() +logger = get_logger() def post_process_video(video_dir, video_id, preset, video_format, low_quality): @@ -23,7 +22,8 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality): if len(files) == 0: logger.error(f"Video file missing in {video_dir} for 
{video_id}") logger.debug(list(video_dir.iterdir())) - raise FileNotFoundError(f"Missing video file in {video_dir}") + msg = f"Missing video file in {video_dir}" + raise FileNotFoundError(msg) if len(files) > 1: logger.warning( f"Multiple video file candidates for {video_id} in {video_dir}. " diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 5fa208c..ac9bbc0 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -36,7 +36,7 @@ safer_reencode, ) -logger = getLogger() +logger = get_logger() options = [ "debug", "name", @@ -79,20 +79,34 @@ def get_kolibri_url_for(file_id: str, ext: str): return f"{STUDIO_URL}/content/storage/{remote_path}", fname -def read_from_zip(ark, member, as_text: Optional[bool] = True): - data = ark.open(member).read() - return data.decode("utf-8") if as_text else data +def read_from_zip_as_bytes(ark, member): + return ark.open(member).read() + + +def read_from_zip_as_text(ark, member): + return read_from_zip_as_bytes(ark, member).decode("utf-8") class Kolibri2Zim: def __init__(self, **kwargs): - for option in options: if option not in kwargs: - raise ValueError(f"Missing parameter `{option}`") + msg = f"Missing parameter `{option}`" + raise ValueError(msg) def go(option): - return kwargs.get(option) + res = kwargs.get(option) + if type(res) is str: + return res + else: + return None + + def gom(option): + res = go(option) + if not res: + msg = f"Unexpected kind of option for {option}" + raise Exception(msg) + return res self.channel_id = go("channel_id") self.root_id = go("root_id") @@ -104,9 +118,11 @@ def go(option): # zim params self.fname = go("fname") - self.tags = ( - [] if go("tags") is None else [t.strip() for t in go("tags").split(",")] - ) + tags = go("tags") + if tags is None: + self.tags = [] + else: + self.tags = [t.strip() for t in tags.split(",")] self.title = go("title") self.description = go("description") self.author = go("creator") @@ -119,14 +135,14 @@ def go(option): self.css = go("css") # 
directory setup - self.output_dir = Path(go("output_dir")).expanduser().resolve() + self.output_dir = Path(str(go("output_dir"))).expanduser().resolve() if go("tmp_dir"): - Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True) + Path(str(go("tmp_dir"))).mkdir(parents=True, exist_ok=True) self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir"))) # performances options - self.nb_threads = go("threads") - self.nb_processes = go("processes") + self.nb_threads = int(gom("threads")) + self.nb_processes = int(gom("processes")) self.s3_url_with_credentials = go("s3_url_with_credentials") self.s3_storage = None self.dedup_html_files = go("dedup_html_files") @@ -136,8 +152,9 @@ def go(option): self.keep_build_dir = go("keep_build_dir") self.debug = go("debug") self.only_topics = go("only_topics") + node_ids = go("node_ids") self.node_ids = ( - None if go("node_ids") is None else [t.strip() for t in go("node_ids").split(",")] + None if node_ids is None else [t.strip() for t in node_ids.split(",")] ) # jinja2 environment setup @@ -265,12 +282,10 @@ def funnel_from_s3(self, file_id, path, checksum, preset): # add to zim with self.creator_lock: - self.creator.add_item( - StaticItem( - path=path, - fileobj=fileobj, - mimetype=preset.mimetype, - ) + self.creator.add_item_for( + path=path, + content=fileobj.read(), + mimetype=preset.mimetype, ) logger.debug(f"Added {path} from S3::{key}") return True @@ -305,7 +320,10 @@ def add_topic_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, title=node["title"], content=html, mimetype="text/html" + path=node_id, + title=node["title"], + content=html.encode(), + mimetype="text/html", ) logger.debug(f"Added topic #{node_id}") @@ -317,24 +335,23 @@ def add_video_node(self, node_id): subtitle files (`video_subtitle`) are VTT files and are only limited by the number of language to select from in kolibri studio""" - files = self.db.get_node_files(node_id, thumbnail=False) - if not files: + files = 
list(self.db.get_node_files(node_id, thumbnail=False)) + if len(files) == 0: return files = sorted(files, key=lambda f: f["prio"]) - it = filter(lambda f: f["supp"] == 0, files) + it: list[dict[str, Any]] = list(filter(lambda f: f["supp"] == 0, files)) - try: - # find main video file - video_file = next(it) - except StopIteration: + if len(it) == 0: # we have no video file return - - try: - alt_video_file = next(it) - except StopIteration: + elif len(it) == 1: # we have no supplementary video file (which is OK) + video_file = it[0] alt_video_file = None + else: + # we have video and alt video + video_file = it[0] + alt_video_file = it[1] # now decide which file to keep and what to do with it @@ -388,7 +405,9 @@ def add_video_node(self, node_id): # we want mp4, either in high-q or we have a low_res file to use else: - video_file = alt_video_file if self.low_quality else video_file + video_file = ( + alt_video_file if self.low_quality and alt_video_file else video_file + ) self.funnel_file(video_file["id"], video_file["ext"]) video_filename = filename_for(video_file) video_filename_ext = video_file["ext"] @@ -401,14 +420,14 @@ def add_video_node(self, node_id): local, english = find_language_names(file["lang"]) except Exception: english = file["lang"] - finally: - subtitles.append( - { - "code": file["lang"], - "name": english, - "filename": filename_for(file), - } - ) + + subtitles.append( + { + "code": file["lang"], + "name": english, + "filename": filename_for(file), + } + ) node = self.db.get_node(node_id, with_parents=True) html = self.jinja2_env.get_template("video.html").render( @@ -424,7 +443,7 @@ def add_video_node(self, node_id): self.creator.add_item_for( path=node_id, title=node["title"], - content=html, + content=html.encode(), mimetype="text/html", ) logger.debug(f"Added video #{node_id}") @@ -435,7 +454,10 @@ def add_video_upon_completion(self, future): logs error in case of failure""" if future.cancelled(): return - src_fname, dst_fpath, path = 
self.videos_futures.get(future) + res = self.videos_futures.get(future) + if not res: + return + src_fname, dst_fpath, path = res try: future.result() @@ -494,7 +516,10 @@ def request_s3_upload_and_removal(self, item): """add file from item to uploads list""" path = item.path del item - dest_fpath, key, meta = self.pending_upload.get(path) + res = self.pending_upload.get(path) + if not res: + return + dest_fpath, key, meta = res # TODO: submit to a thread executor (to create) instead # this is currently called on main-tread. self.upload_to_s3(key, dest_fpath, **meta) @@ -521,7 +546,7 @@ def add_audio_node(self, node_id): self.creator.add_item_for( path=node_id, title=node["title"], - content=html, + content=html.encode(), mimetype="text/html", ) logger.debug(f"Added audio #{node_id}") @@ -554,7 +579,7 @@ def add_exercise_node(self, node_id): if manifest_name not in zip_ark.namelist(): logger.error(f"Excercise node without {manifest_name}") return - manifest = json.loads(read_from_zip(zip_ark, manifest_name)) + manifest = json.loads(read_from_zip_as_bytes(zip_ark, manifest_name)) # copy exercise content, rewriting internal paths # all internal resources to be stored under {node_id}/ prefix @@ -562,14 +587,14 @@ def add_exercise_node(self, node_id): for assessment_item in manifest.get("all_assessment_items", []): item_path = f"{assessment_item}.json" if item_path in zip_ark.namelist(): - perseus_content = read_from_zip(zip_ark, item_path) + perseus_content = read_from_zip_as_text(zip_ark, item_path) perseus_content = perseus_content.replace( r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}" ) perseus_content = perseus_content.replace( r"${☣ LOCALPATH}", f"./{node_id}" ) - assessment_items.append(perseus_content) + assessment_items.append(perseus_content) # add all support files to ZIM for ark_member in zip_ark.namelist(): @@ -581,12 +606,12 @@ def add_exercise_node(self, node_id): self.creator.add_item_for( path=path, title="", - 
content=read_from_zip(zip_ark, ark_member, as_text=False), + content=read_from_zip_as_bytes(zip_ark, ark_member), ) logger.debug(f"Added exercise support file {path}") # prepare and add exercise HTML article - node = self.db.get_node(node_id, with_parents=True) + node = self.db.get_node(node_id, with_parents=True, with_children=False) html = self.jinja2_env.get_template("perseus_exercise.html").render( node_id=node_id, perseus_content=f"[{', '.join(assessment_items)}]", @@ -595,7 +620,10 @@ def add_exercise_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, title=node["title"], content=html, mimetype="text/html" + path=node_id, + title=node["title"], + content=html.encode(), + mimetype="text/html", ) logger.debug(f"Added exercise node #{node_id}") @@ -671,7 +699,7 @@ def target_for(file): self.creator.add_item_for( path=path, title=node["title"], - content=html, + content=html.encode(), mimetype="text/html", ) logger.debug(f"Added document #{node_id}") @@ -709,7 +737,7 @@ def add_html5_node(self, node_id): # calculate hash of file and add entry if not in zim already content = zip_ark.open(ark_member).read() - content_hash = hashlib.md5(content).hexdigest() # nosec + content_hash = hashlib.md5(content).hexdigest() # nosec # noqa: S324 if content_hash not in self.html_files_cache: self.html_files_cache.append(content_hash) @@ -730,7 +758,8 @@ def add_html5_node(self, node_id): def run(self): if self.s3_url_with_credentials and not self.s3_credentials_ok(): - raise ValueError("Unable to connect to Optimization Cache. Check its URL.") + msg = "Unable to connect to Optimization Cache. Check its URL." 
+ raise ValueError(msg) s3_msg = ( f" using cache: {self.s3_storage.url.netloc} " @@ -770,19 +799,34 @@ def run(self): self.output_dir.mkdir(parents=True, exist_ok=True) self.creator_lock = threading.Lock() + if not self.root_id: + logger.error("Missing root id") + return 1 + if not self.title: + logger.error("Missing title") + return 1 + if not self.description: + logger.error("Missing description") + return 1 + if not self.author: + logger.error("Missing author") + return 1 + if not self.publisher: + logger.error("Missing publisher") + return 1 self.creator = Creator( - filename=self.output_dir.joinpath(self.fname), + filename=self.output_dir.joinpath(self.clean_fname), main_path=self.root_id, ignore_duplicates=True, ) self.creator.config_metadata( - Name=self.name, + Name=self.clean_fname, Language="eng", Title=self.title, Description=self.description, Creator=self.author, Publisher=self.publisher, - Date=datetime.date.today().strftime("%Y-%d-%m"), + Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"), Illustration_48x48_at_1=self.favicon_fpath.read_bytes(), ) self.creator.start() @@ -830,8 +874,9 @@ def run(self): f"FAILURE not_done={len(result.not_done)} done={len(result.done)}" ) for future in result.done: - if future.exception(): - raise future.exception() + future_exception = future.exception() + if future_exception: + raise future_exception except KeyboardInterrupt: self.creator.can_finish = False logger.error("KeyboardInterrupt, exiting.") @@ -887,18 +932,21 @@ def sanitize_inputs(self): channel_meta = self.db.get_channel_metadata(self.channel_id) # input & metadata sanitation - period = datetime.datetime.now().strftime("%Y-%m") + period = datetime.datetime.now(datetime.UTC).strftime("%Y-%m") if self.fname: # make sure we were given a filename and not a path - self.fname = Path(self.fname.format(period=period)) - if Path(self.fname.name) != self.fname: - raise ValueError(f"filename is not a filename: {self.fname}") + fname_path = 
Path(self.fname.format(period=period)).resolve() + if not fname_path.is_file(): + msg = f"filename is not a filename: {self.fname}" + raise ValueError(msg) + self.clean_fname = fname_path.as_posix() else: - self.fname = f"{self.name}_{period}.zim" + self.clean_fname = f"{self.name}_{period}.zim" if not self.title: - self.title = channel_meta["name"] - self.title = self.title.strip() + self.clean_title = channel_meta["name"].strip() + else: + self.clean_title = self.title.strip() if not self.description: self.description = channel_meta["description"] @@ -969,18 +1017,24 @@ def add_custom_about_and_css(self): if self.about: # if user provided a custom about page, use it - with open( - handle_user_provided_file( - source=self.about, in_dir=self.build_dir, nocopy=True - ), - "r", - ) as fh: - soup = BeautifulSoup(fh.read(), "lxml") - title = soup.find("title").text - content = soup.select("body > .container") - # we're only interested in the first one - if isinstance(content, list): - content = content[0] + user_provided_file = handle_user_provided_file( + source=self.about, in_dir=self.build_dir, nocopy=True + ) + if not user_provided_file: + title = channel_meta["name"] + content = None + else: + with open(user_provided_file) as fh: + soup = BeautifulSoup(fh.read(), "lxml") + title = soup.find("title") + if not title: + msg = "Failed to extract title" + raise Exception(msg) + title = title.text + content = soup.select("body > .container") + # we're only interested in the first one + if isinstance(content, list): + content = content[0] else: title = channel_meta["name"] content = None @@ -992,23 +1046,26 @@ def add_custom_about_and_css(self): self.creator.add_item_for( path="about", title=title, - content=html, + content=html.encode(), mimetype="text/html", ) del html # if user provided a custom CSS file, use it if self.css: - with open( - handle_user_provided_file( - source=self.css, in_dir=self.build_dir, nocopy=True - ), - "r", - ) as fh: - content = fh.read() + 
user_provided_file = handle_user_provided_file( + source=self.css, in_dir=self.build_dir, nocopy=True + ) + if not user_provided_file: + content = "" + else: + with open(user_provided_file) as fh: + content = fh.read() # otherwise, create a blank one else: content = "" - self.creator.add_item_for("custom.css", content=content, mimetype="text/css") + self.creator.add_item_for( + "custom.css", content=content.encode(), mimetype="text/css" + ) logger.debug("Added about page and custom CSS") From e1686d0e6e670ca5c2bc3b340df14e62e304946a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 09:16:19 +0200 Subject: [PATCH 08/45] Adapt CHANGELOG + fix typo --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eb19cd..bfc2123 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## Fixed +### Fixed - Fix issue with ePub rendering which was outside the iframe +- Many small fixes (including some bugs) detected by ruff / pyright ### Changed +- Migrate to our new Python standard (hatch, ruff, pyright, ...) 
- Using zimscraperlib 3.1.0 - Updated image to `python:3.11-bullseye` - Retry video reencoding up to three times From 1777e57bb0f6d2fbdec583f2b322c97263c3dac5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Jul 2023 09:30:31 +0200 Subject: [PATCH 09/45] Add QA workflow + publish on releases only, to PyPI in addition to Docker --- .github/workflows/docker.yml | 29 ------------------- .github/workflows/publish.yml | 52 +++++++++++++++++++++++++++++++++++ .github/workflows/pull.yml | 10 +++++++ .github/workflows/push.yml | 8 ++++++ .github/workflows/qa.yml | 34 +++++++++++++++++++++++ 5 files changed, 104 insertions(+), 29 deletions(-) delete mode 100644 .github/workflows/docker.yml create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/pull.yml create mode 100644 .github/workflows/push.yml create mode 100644 .github/workflows/qa.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 2ca5fb4..0000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Docker - -on: - push: - branches: - - main - tags: - - v* - -jobs: - build-and-push: - name: Deploy Docker Image - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3.4.0 - - name: Build and push - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/kolibri - on-master: dev - tag-pattern: /^v([0-9.]+)$/ - latest-on-tag: true - restrict-to: openzim/kolibri - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..fd9d884 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,52 @@ +name: Build and upload to PyPI + +on: + release: + types: [published] + +jobs: + publish: + runs-on: ubuntu-22.04 + permissions: + id-token: write # mandatory for PyPI trusted
publishing + + steps: + - uses: actions/checkout@v3.5.3 + + - name: Set up Python 3.11 + uses: actions/setup-python@v4.6.1 + with: + python-version: "3.11" + architecture: x64 + + - name: Build packages + run: | + pip install -U pip build + python -m build sdist wheel + + - name: Upload to PyPI + uses: pypa/gh-action-pypi-publish@v1.8.6 + # dont specify anything for Trusted Publishing + # https://docs.pypi.org/trusted-publishers + # with: + # # Using token + # user: __token__ + # password: ${{ secrets.PYPI_API_TOKEN }} + # + # # Using token on test index + # password: ${{ secrets.PYPI_TEST_API_TOKEN }} + # repository_url: https://test.pypi.org/legacy/ + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/kolibri + on-master: dev + tag-pattern: /^v([0-9.]+)$/ + latest-on-tag: true + restrict-to: openzim/kolibri + registries: ghcr.io + credentials: GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml new file mode 100644 index 0000000..be2b9de --- /dev/null +++ b/.github/workflows/pull.yml @@ -0,0 +1,10 @@ +name: Pull Request + +on: + pull_request: + +jobs: + qa: + uses: ./.github/workflows/qa.yml + # run qa job if the pull request originates from a fork (otherwise the qa is already triggered by the push to a branch) + if: github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml new file mode 100644 index 0000000..d84f392 --- /dev/null +++ b/.github/workflows/push.yml @@ -0,0 +1,8 @@ +name: Push + +on: + push: + +jobs: + qa: + uses: ./.github/workflows/qa.yml diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml new file mode 100644 index 0000000..8927ab5 --- /dev/null +++ b/.github/workflows/qa.yml @@ -0,0 +1,34 @@ +name: QA + +on: + 
workflow_call: + +jobs: + check-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3.5.3 + + - name: Set up Python 3.11 + uses: actions/setup-python@v4.6.1 + with: + python-version: "3.11" + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[lint,scripts,test] + + - name: Check black formatting + run: inv lint-black + + - name: Check ruff + run: inv lint-ruff + + # Installs and run pyright (node). Easier/faster alt. to: inv check-pyright + - name: Check with pyright + uses: jakebailey/pyright-action@v1.6.0 + with: + version: 1.1.311 From 8a87e34bf3f24b9c78ab853d9428c035937b1982 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 18 Jul 2023 18:04:32 +0200 Subject: [PATCH 10/45] WIP --- .github/workflows/publish.yml | 15 +++------------ .github/workflows/qa.yml | 9 +++------ dump_channel_to_fs.py | 3 +-- kolibri2zim/database.py | 3 +-- kolibri2zim/processing.py | 3 +-- kolibri2zim/scraper.py | 15 +++++---------- pyproject.toml | 1 + 7 files changed, 15 insertions(+), 34 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index fd9d884..ef88218 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -16,7 +16,7 @@ jobs: - name: Set up Python 3.11 uses: actions/setup-python@v4.6.1 with: - python-version: "3.11" + python-version-file: "pyproject.toml" architecture: x64 - name: Build packages @@ -26,16 +26,6 @@ jobs: - name: Upload to PyPI uses: pypa/gh-action-pypi-publish@v1.8.6 - # dont specify anything for Trusted Publishing - # https://docs.pypi.org/trusted-publishers - # with: - # # Using token - # user: __token__ - # password: ${{ secrets.PYPI_API_TOKEN }} - # - # # Using token on test index - # password: ${{ secrets.PYPI_TEST_API_TOKEN }} - # repository_url: https://test.pypi.org/legacy/ - name: Build and push Docker image uses: openzim/docker-publish-action@v10 @@ -46,7 +36,8 @@ jobs: latest-on-tag: true restrict-to: 
openzim/kolibri registries: ghcr.io - credentials: GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto repo_overview: auto diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml index 8927ab5..abf892b 100644 --- a/.github/workflows/qa.yml +++ b/.github/workflows/qa.yml @@ -13,7 +13,7 @@ jobs: - name: Set up Python 3.11 uses: actions/setup-python@v4.6.1 with: - python-version: "3.11" + python-version-file: 'pyproject.toml' architecture: x64 - name: Install dependencies (and project) @@ -27,8 +27,5 @@ jobs: - name: Check ruff run: inv lint-ruff - # Installs and run pyright (node). Easier/faster alt. to: inv check-pyright - - name: Check with pyright - uses: jakebailey/pyright-action@v1.6.0 - with: - version: 1.1.311 + - name: Check pyright + run: inv check-pyright diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py index 87ab761..4ed042c 100755 --- a/dump_channel_to_fs.py +++ b/dump_channel_to_fs.py @@ -62,8 +62,7 @@ def download_if_missing(url, fpath, fsize=None, *, force=False): ) if wget.returncode != 0: logger.error(wget.stdout) - msg = f"wget exited with retcode {wget.returncode}" - raise Exception(msg) + raise Exception(f"wget exited with retcode {wget.returncode}") return not skipped, url, fpath diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py index 5c0f0d2..99b5541 100644 --- a/kolibri2zim/database.py +++ b/kolibri2zim/database.py @@ -36,8 +36,7 @@ def __init__(self, fpath: pathlib.Path, root_id: str | None = None): self.root = self.get_node(root_id) if not self.root: - msg = f"No node for root-id {root_id}" - raise ValueError(msg) + raise ValueError(f"No node for root-id {root_id}") @property def fpath(self): diff --git a/kolibri2zim/processing.py b/kolibri2zim/processing.py index 49a3088..6132850 100644 --- a/kolibri2zim/processing.py +++ b/kolibri2zim/processing.py @@ -22,8 +22,7 @@ def 
post_process_video(video_dir, video_id, preset, video_format, low_quality): if len(files) == 0: logger.error(f"Video file missing in {video_dir} for {video_id}") logger.debug(list(video_dir.iterdir())) - msg = f"Missing video file in {video_dir}" - raise FileNotFoundError(msg) + raise FileNotFoundError(f"Missing video file in {video_dir}") if len(files) > 1: logger.warning( f"Multiple video file candidates for {video_id} in {video_dir}. " diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index ac9bbc0..bdb6a5d 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -91,8 +91,7 @@ class Kolibri2Zim: def __init__(self, **kwargs): for option in options: if option not in kwargs: - msg = f"Missing parameter `{option}`" - raise ValueError(msg) + raise ValueError(f"Missing parameter `{option}`") def go(option): res = kwargs.get(option) @@ -104,8 +103,7 @@ def go(option): def gom(option): res = go(option) if not res: - msg = f"Unexpected kind of option for {option}" - raise Exception(msg) + raise Exception(f"Unexpected kind of option for {option}") return res self.channel_id = go("channel_id") @@ -758,8 +756,7 @@ def add_html5_node(self, node_id): def run(self): if self.s3_url_with_credentials and not self.s3_credentials_ok(): - msg = "Unable to connect to Optimization Cache. Check its URL." - raise ValueError(msg) + raise ValueError("Unable to connect to Optimization Cache. 
Check its URL.") s3_msg = ( f" using cache: {self.s3_storage.url.netloc} " @@ -937,8 +934,7 @@ def sanitize_inputs(self): # make sure we were given a filename and not a path fname_path = Path(self.fname.format(period=period)).resolve() if not fname_path.is_file(): - msg = f"filename is not a filename: {self.fname}" - raise ValueError(msg) + raise ValueError(f"filename is not a filename: {self.fname}") self.clean_fname = fname_path.as_posix() else: self.clean_fname = f"{self.name}_{period}.zim" @@ -1028,8 +1024,7 @@ def add_custom_about_and_css(self): soup = BeautifulSoup(fh.read(), "lxml") title = soup.find("title") if not title: - msg = "Failed to extract title" - raise Exception(msg) + raise Exception("Failed to extract title") title = title.text content = soup.select("body > .container") # we're only interested in the first one diff --git a/pyproject.toml b/pyproject.toml index 2f44f8a..6776f2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,6 +143,7 @@ select = [ ignore = [ # Allow non-abstract empty methods in abstract base classes "B027", + "EM", # Allow boolean positional values in function calls, like `dict.get(... 
True)` "FBT003", # Ignore checks for possible passwords From b74cdaf9c1e3b1fcc0848a047d60dc1b3c974a6a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 20 Jul 2023 17:23:23 +0200 Subject: [PATCH 11/45] Use str instead of as_posix --- hatch_build.py | 2 +- kolibri2zim/scraper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hatch_build.py b/hatch_build.py index 66b1f69..d98f9e3 100644 --- a/hatch_build.py +++ b/hatch_build.py @@ -10,7 +10,7 @@ class GetJsDepsHook(BuildHookInterface): def initialize(self, version, build_data): subprocess.run( - Path(self.root).joinpath("get_js_deps.sh").as_posix(), # noqa : S603 + str(Path(self.root).joinpath("get_js_deps.sh")), # noqa : S603 check=True, ) return super().initialize(version, build_data) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index bdb6a5d..4c48923 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -935,7 +935,7 @@ def sanitize_inputs(self): fname_path = Path(self.fname.format(period=period)).resolve() if not fname_path.is_file(): raise ValueError(f"filename is not a filename: {self.fname}") - self.clean_fname = fname_path.as_posix() + self.clean_fname = str(fname_path) else: self.clean_fname = f"{self.name}_{period}.zim" From ae2372c8c5a356ef0b1ed25c1e5c767fc3ab9338 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:32:35 +0200 Subject: [PATCH 12/45] Source version from appropriate new file --- kolibri2zim/constants.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kolibri2zim/constants.py b/kolibri2zim/constants.py index 6d7a7bb..f8a27fb 100644 --- a/kolibri2zim/constants.py +++ b/kolibri2zim/constants.py @@ -8,11 +8,12 @@ from zimscraperlib.logging import getLogger as lib_getLogger +from kolibri2zim.__about__ import __version__ + ROOT_DIR = pathlib.Path(__file__).parent NAME = ROOT_DIR.name -with open(ROOT_DIR.joinpath("VERSION")) as fh: - VERSION = fh.read().strip() +VERSION = __version__ SCRAPER = f"{NAME} 
{VERSION}" From 05fb7a606d28ac123ba7f72e1a52aae9edc06c57 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:34:03 +0200 Subject: [PATCH 13/45] Fix Dockerfile for new build system --- Dockerfile | 11 ++++++----- install.sh | 5 +++++ pyproject.toml | 4 ++++ 3 files changed, 15 insertions(+), 5 deletions(-) create mode 100755 install.sh diff --git a/Dockerfile b/Dockerfile index 274c715..89a7302 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,13 +5,14 @@ LABEL org.opencontainers.image.source https://github.com/openzim/kolibri2zim RUN apt-get update -y \ && apt-get install -y --no-install-recommends locales-all unzip ffmpeg \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && python -m pip install -U pip hatch -COPY requirements.txt /src/ -RUN pip3 install --no-cache-dir -r /src/requirements.txt +#COPY requirements.txt /src/ +#RUN pip3 install --no-cache-dir -r /src/requirements.txt COPY kolibri2zim /src/kolibri2zim -COPY setup.py *.md get_js_deps.sh MANIFEST.in /src/ -RUN cd /src/ && ./get_js_deps.sh && python3 ./setup.py install +COPY pyproject.toml *.md get_js_deps.sh install.sh MANIFEST.in LICENSE *.py /src/ +RUN cd /src/ && HATCH_BUILD_HOOKS_ENABLE=true hatch build -t sdist && ./install.sh # default output directory RUN mkdir -p /output diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..0b6a66b --- /dev/null +++ b/install.sh @@ -0,0 +1,5 @@ +search_dir=/src/dist/*.tar.gz +for entry in $search_dir +do + pip install "$entry" +done diff --git a/pyproject.toml b/pyproject.toml index 6776f2d..ca1854c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dev = [ "kolibri2zim[scripts]", "kolibri2zim[lint]", "kolibri2zim[check]", + "hatchling", ] [project.urls] @@ -50,6 +51,9 @@ path = "kolibri2zim/__about__.py" exclude = ["/.github"] [tool.hatch.build.hooks.custom] +enable-by-default = false +path = "hatch_build.py" +dependencies = ["zimscraperlib==3.1.0"] [tool.hatch.envs.default] 
features = ["dev"] From 9d1daaaf9a94d6605bfbbd59094838468a41dd20 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:34:34 +0200 Subject: [PATCH 14/45] Remove obsolete file --- requirements.txt | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index fe0d483..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -## Direct dependencies -zimscraperlib==3.1.0 -kiwixstorage==0.8.3 -Jinja2==3.1.2 -pif==0.8.2 -beautifulsoup4==4.9.3 -retrying==1.3.4 From 22c3b88ba3f4861d7ced040ead0e236abe6e615a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:35:21 +0200 Subject: [PATCH 15/45] Upgrade Zimscraperlib to 3.1.1 + remove useless code --- kolibri2zim/scraper.py | 16 +++++++--------- pyproject.toml | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 4c48923..6e52913 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -320,7 +320,7 @@ def add_topic_node(self, node_id): self.creator.add_item_for( path=node_id, title=node["title"], - content=html.encode(), + content=html, mimetype="text/html", ) logger.debug(f"Added topic #{node_id}") @@ -441,7 +441,7 @@ def add_video_node(self, node_id): self.creator.add_item_for( path=node_id, title=node["title"], - content=html.encode(), + content=html, mimetype="text/html", ) logger.debug(f"Added video #{node_id}") @@ -544,7 +544,7 @@ def add_audio_node(self, node_id): self.creator.add_item_for( path=node_id, title=node["title"], - content=html.encode(), + content=html, mimetype="text/html", ) logger.debug(f"Added audio #{node_id}") @@ -620,7 +620,7 @@ def add_exercise_node(self, node_id): self.creator.add_item_for( path=node_id, title=node["title"], - content=html.encode(), + content=html, mimetype="text/html", ) logger.debug(f"Added exercise node #{node_id}") @@ -697,7 +697,7 @@ def target_for(file): 
self.creator.add_item_for( path=path, title=node["title"], - content=html.encode(), + content=html, mimetype="text/html", ) logger.debug(f"Added document #{node_id}") @@ -1041,7 +1041,7 @@ def add_custom_about_and_css(self): self.creator.add_item_for( path="about", title=title, - content=html.encode(), + content=html, mimetype="text/html", ) del html @@ -1060,7 +1060,5 @@ def add_custom_about_and_css(self): else: content = "" - self.creator.add_item_for( - "custom.css", content=content.encode(), mimetype="text/css" - ) + self.creator.add_item_for("custom.css", content=content, mimetype="text/css") logger.debug("Added about page and custom CSS") diff --git a/pyproject.toml b/pyproject.toml index ca1854c..ebb50c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", ] dependencies = [ - "zimscraperlib==3.1.0", + "zimscraperlib==3.1.1", "kiwixstorage==0.8.3", "Jinja2==3.1.2", "pif==0.8.2", @@ -53,7 +53,7 @@ exclude = ["/.github"] [tool.hatch.build.hooks.custom] enable-by-default = false path = "hatch_build.py" -dependencies = ["zimscraperlib==3.1.0"] +dependencies = ["zimscraperlib==3.1.1"] [tool.hatch.envs.default] features = ["dev"] From fc7c08583950928ea8423cf8f565b6aaa27c79ca Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:36:02 +0200 Subject: [PATCH 16/45] Fix computation of nb_threads and nb_processes --- kolibri2zim/scraper.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 6e52913..181f60c 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -100,12 +100,6 @@ def go(option): else: return None - def gom(option): - res = go(option) - if not res: - raise Exception(f"Unexpected kind of option for {option}") - return res - self.channel_id = go("channel_id") self.root_id = go("root_id") @@ -139,8 +133,10 @@ def gom(option): self.build_dir = 
Path(tempfile.mkdtemp(dir=go("tmp_dir"))) # performances options - self.nb_threads = int(gom("threads")) - self.nb_processes = int(gom("processes")) + nb_threads_str = go("threads") + self.nb_threads = int(nb_threads_str) if nb_threads_str else None + nb_processes_str = go("processes") + self.nb_processes = int(nb_processes_str) if nb_processes_str else None self.s3_url_with_credentials = go("s3_url_with_credentials") self.s3_storage = None self.dedup_html_files = go("dedup_html_files") From e35f35e04d633223d60a13e847adfb96f5baad47 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:36:36 +0200 Subject: [PATCH 17/45] Fail early if it looks like JS dependencies are not available --- kolibri2zim/scraper.py | 68 +++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 181f60c..4731f36 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -7,6 +7,7 @@ import hashlib import io import json +import os import shutil import tempfile import threading @@ -751,24 +752,7 @@ def add_html5_node(self, node_id): logger.debug(f"Added HTML5 node #{node_id}") def run(self): - if self.s3_url_with_credentials and not self.s3_credentials_ok(): - raise ValueError("Unable to connect to Optimization Cache. 
Check its URL.") - - s3_msg = ( - f" using cache: {self.s3_storage.url.netloc} " - f"with bucket: {self.s3_storage.bucket_name}" - if self.s3_storage - else "" - ) - logger.info( - f"Starting scraper with:\n" - f" channel_id: {self.channel_id}\n" - f" build_dir: {self.build_dir}\n" - f" output_dir: {self.output_dir}\n" - f" using webm : {self.use_webm}\n" - f" low_quality : {self.low_quality}\n" - f"{s3_msg}" - ) + self.ensure_js_deps_are_present() logger.info("Download database") self.download_db() @@ -1058,3 +1042,51 @@ def add_custom_about_and_css(self): self.creator.add_item_for("custom.css", content=content, mimetype="text/css") logger.debug("Added about page and custom CSS") + + def ensure_js_deps_are_present(self): + for js_deps_file in [ + "epub.min.js", + "jszip.min.js", + "jquery.min.js", + "videojs-ogvjs.js", + ]: + if not os.path.exists( + self.templates_dir.joinpath(f"assets/{js_deps_file}") + ): + raise ValueError( + "It looks like JS deps have not been installed," + f" {js_deps_file} is missing" + ) + + for js_deps_dir in [ + "pdfjs", + "videojs", + "ogvjs", + "bootstrap", + "bootstrap-icons", + "perseus", + ]: + if not os.path.exists(self.templates_dir.joinpath(f"assets/{js_deps_dir}")): + raise ValueError( + "It looks like JS deps have not been installed," + f" {js_deps_dir} is missing" + ) + + if self.s3_url_with_credentials and not self.s3_credentials_ok(): + raise ValueError("Unable to connect to Optimization Cache. 
Check its URL.") + + s3_msg = ( + f" using cache: {self.s3_storage.url.netloc} " + f"with bucket: {self.s3_storage.bucket_name}" + if self.s3_storage + else "" + ) + logger.info( + f"Starting scraper with:\n" + f" channel_id: {self.channel_id}\n" + f" build_dir: {self.build_dir}\n" + f" output_dir: {self.output_dir}\n" + f" using webm : {self.use_webm}\n" + f" low_quality : {self.low_quality}\n" + f"{s3_msg}" + ) From 20d6102201498bc2e0a95f177b32c257b21aaa2b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 10:56:47 +0200 Subject: [PATCH 18/45] Revert useless modifications --- kolibri2zim/database.py | 11 +---------- kolibri2zim/scraper.py | 6 +----- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/kolibri2zim/database.py b/kolibri2zim/database.py index 99b5541..7ecfa6e 100644 --- a/kolibri2zim/database.py +++ b/kolibri2zim/database.py @@ -208,16 +208,7 @@ def get_node_files(self, node_id, *, thumbnail=False): "ORDER BY priority ASC", (node_id, 1, 1 if thumbnail else 0), ): - yield { - "id": row["id"], - "fid": row["fid"], - "ext": row["ext"], - "prio": row["prio"], - "supp": row["supp"], - "checksum": row["checksum"], - "lang": row["lang"], - "preset": row["preset"], - } + yield dict(row) def get_node_thumbnail(self, node_id): return self.get_node_file(node_id, thumbnail=True) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 4731f36..bacddce 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -95,11 +95,7 @@ def __init__(self, **kwargs): raise ValueError(f"Missing parameter `{option}`") def go(option): - res = kwargs.get(option) - if type(res) is str: - return res - else: - return None + return kwargs.get(option) self.channel_id = go("channel_id") self.root_id = go("root_id") From 41ccf9388c8c65d336149a29e1cecc96baf44abb Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 11:06:54 +0200 Subject: [PATCH 19/45] Rewrite tmp_dir usage to avoid many calls to go() function --- dump_channel_to_fs.py 
| 24 ++++------------ kolibri2zim/scraper.py | 64 +++++++++++++++++++++--------------------- 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py index 4ed042c..ad32cc0 100755 --- a/dump_channel_to_fs.py +++ b/dump_channel_to_fs.py @@ -144,27 +144,13 @@ def on_success(result): logger.info("Done downloading files") -CHANNEL_ID_POS_IN_ARGV = 2 -BUILD_DIR_POS_IN_ARGV = 3 -FORCE_POS_IN_ARGV = 4 - if __name__ == "__main__": - if len(sys.argv) < CHANNEL_ID_POS_IN_ARGV: + args = [sys.argv[idx] if len(sys.argv) >= idx + 1 else None for idx in range(4)] + _, channel_id, build_dir, force = args + + if not channel_id: logger.error("Missing channel ID") sys.exit(1) - - channel_id = sys.argv[1] - if len(sys.argv) >= BUILD_DIR_POS_IN_ARGV: - build_dir = sys.argv[2] - else: - build_dir = None - if len(sys.argv) >= FORCE_POS_IN_ARGV: - force = ( - sys.argv[3].lower() == "true" - or sys.argv[3].lower() == "force" - or sys.argv[3].lower() == "yes" - ) - else: - force = False + force = bool(str(force).lower() in ("true", "force", "yes")) dump(channel_id=channel_id, build_dir=build_dir, force=force) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index bacddce..b820982 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -13,7 +13,6 @@ import threading import zipfile from pathlib import Path -from typing import Any import jinja2 from bs4 import BeautifulSoup @@ -125,9 +124,10 @@ def go(option): # directory setup self.output_dir = Path(str(go("output_dir"))).expanduser().resolve() - if go("tmp_dir"): - Path(str(go("tmp_dir"))).mkdir(parents=True, exist_ok=True) - self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir"))) + tmp_dir = go("tmp_dir") + if tmp_dir: + Path(tmp_dir).mkdir(parents=True, exist_ok=True) + self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir)) # performances options nb_threads_str = go("threads") @@ -273,11 +273,12 @@ def funnel_from_s3(self, file_id, path, checksum, preset): # add 
to zim with self.creator_lock: - self.creator.add_item_for( - path=path, - content=fileobj.read(), - mimetype=preset.mimetype, - ) + kwargs = { + "path": path, + "fileobj": fileobj, + "mimetype": preset.mimetype, + } + self.creator.add_item_for(StaticItem(**kwargs)) logger.debug(f"Added {path} from S3::{key}") return True @@ -326,23 +327,23 @@ def add_video_node(self, node_id): subtitle files (`video_subtitle`) are VTT files and are only limited by the number of language to select from in kolibri studio""" - files = list(self.db.get_node_files(node_id, thumbnail=False)) - if len(files) == 0: + files = self.db.get_node_files(node_id, thumbnail=False) + if not files: return files = sorted(files, key=lambda f: f["prio"]) - it: list[dict[str, Any]] = list(filter(lambda f: f["supp"] == 0, files)) - - if len(it) == 0: + it = filter(lambda f: f["supp"] == 0, files) + try: + # find main video file + video_file = next(it) + except StopIteration: # we have no video file return - elif len(it) == 1: + + try: + alt_video_file = next(it) + except StopIteration: # we have no supplementary video file (which is OK) - video_file = it[0] alt_video_file = None - else: - # we have video and alt video - video_file = it[0] - alt_video_file = it[1] # now decide which file to keep and what to do with it @@ -445,10 +446,10 @@ def add_video_upon_completion(self, future): logs error in case of failure""" if future.cancelled(): return - res = self.videos_futures.get(future) - if not res: + try: + src_fname, dst_fpath, path = self.videos_futures[future] + except KeyError: return - src_fname, dst_fpath, path = res try: future.result() @@ -507,10 +508,10 @@ def request_s3_upload_and_removal(self, item): """add file from item to uploads list""" path = item.path del item - res = self.pending_upload.get(path) - if not res: + try: + dest_fpath, key, meta = self.pending_upload[path] + except KeyError: return - dest_fpath, key, meta = res # TODO: submit to a thread executor (to create) instead # this 
is currently called on main-tread. self.upload_to_s3(key, dest_fpath, **meta) @@ -908,17 +909,16 @@ def sanitize_inputs(self): period = datetime.datetime.now(datetime.UTC).strftime("%Y-%m") if self.fname: # make sure we were given a filename and not a path - fname_path = Path(self.fname.format(period=period)).resolve() - if not fname_path.is_file(): - raise ValueError(f"filename is not a filename: {self.fname}") + fname_path = Path(str(self.fname).format(period=period)) + if Path(fname_path.name) != fname_path: + raise ValueError(f"filename is not a filename: {fname_path}") self.clean_fname = str(fname_path) else: self.clean_fname = f"{self.name}_{period}.zim" if not self.title: - self.clean_title = channel_meta["name"].strip() - else: - self.clean_title = self.title.strip() + self.title = channel_meta["name"] + self.title = self.title.strip() if not self.description: self.description = channel_meta["description"] From a233ee85a7fa50a6eb98e6237bc046f0224365d4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 14:21:26 +0200 Subject: [PATCH 20/45] Only one read_from_zip makes more sense --- kolibri2zim/scraper.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index b820982..cfcec14 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -79,14 +79,10 @@ def get_kolibri_url_for(file_id: str, ext: str): return f"{STUDIO_URL}/content/storage/{remote_path}", fname -def read_from_zip_as_bytes(ark, member): +def read_from_zip(ark, member): return ark.open(member).read() -def read_from_zip_as_text(ark, member): - return read_from_zip_as_bytes(ark, member).decode("utf-8") - - class Kolibri2Zim: def __init__(self, **kwargs): for option in options: @@ -571,7 +567,7 @@ def add_exercise_node(self, node_id): if manifest_name not in zip_ark.namelist(): logger.error(f"Excercise node without {manifest_name}") return - manifest = json.loads(read_from_zip_as_bytes(zip_ark, 
manifest_name)) + manifest = json.loads(read_from_zip(zip_ark, manifest_name)) # copy exercise content, rewriting internal paths # all internal resources to be stored under {node_id}/ prefix @@ -579,7 +575,7 @@ def add_exercise_node(self, node_id): for assessment_item in manifest.get("all_assessment_items", []): item_path = f"{assessment_item}.json" if item_path in zip_ark.namelist(): - perseus_content = read_from_zip_as_text(zip_ark, item_path) + perseus_content = read_from_zip(zip_ark, item_path).decode("utf-8") perseus_content = perseus_content.replace( r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}" ) @@ -598,7 +594,7 @@ def add_exercise_node(self, node_id): self.creator.add_item_for( path=path, title="", - content=read_from_zip_as_bytes(zip_ark, ark_member), + content=read_from_zip(zip_ark, ark_member), ) logger.debug(f"Added exercise support file {path}") From 7d408ddcff6d527fb636b44f12b248e933a3ef17 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 14:21:46 +0200 Subject: [PATCH 21/45] Read directly from the files --- kolibri2zim/scraper.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index cfcec14..af2b073 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -992,16 +992,15 @@ def add_custom_about_and_css(self): title = channel_meta["name"] content = None else: - with open(user_provided_file) as fh: - soup = BeautifulSoup(fh.read(), "lxml") - title = soup.find("title") - if not title: - raise Exception("Failed to extract title") - title = title.text - content = soup.select("body > .container") - # we're only interested in the first one - if isinstance(content, list): - content = content[0] + soup = BeautifulSoup(user_provided_file.read_bytes(), "lxml") + title = soup.find("title") + if not title: + raise Exception("Failed to extract title") + title = title.text + content = soup.select("body > .container") + # we're only 
interested in the first one + if isinstance(content, list): + content = content[0] else: title = channel_meta["name"] content = None @@ -1026,8 +1025,7 @@ def add_custom_about_and_css(self): if not user_provided_file: content = "" else: - with open(user_provided_file) as fh: - content = fh.read() + content = user_provided_file.read_bytes() # otherwise, create a blank one else: content = "" From fb341f70b36a8d1a77779930c937a3bfdfa4166c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 14:30:10 +0200 Subject: [PATCH 22/45] Use major Github actions versions --- .github/workflows/publish.yml | 8 ++++---- .github/workflows/qa.yml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ef88218..432ffc8 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -11,12 +11,12 @@ jobs: id-token: write # mandatory for PyPI trusted publishing steps: - - uses: actions/checkout@v3.5.3 + - uses: actions/checkout@v3 - name: Set up Python 3.11 - uses: actions/setup-python@v4.6.1 + uses: actions/setup-python@v4 with: - python-version-file: "pyproject.toml" + python-version-file: pyproject.toml architecture: x64 - name: Build packages @@ -25,7 +25,7 @@ jobs: python -m build sdist wheel - name: Upload to PyPI - uses: pypa/gh-action-pypi-publish@v1.8.6 + uses: pypa/gh-action-pypi-publish@release/v1.8 - name: Build and push Docker image uses: openzim/docker-publish-action@v10 diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml index abf892b..ec036c0 100644 --- a/.github/workflows/qa.yml +++ b/.github/workflows/qa.yml @@ -8,12 +8,12 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3.5.3 + - uses: actions/checkout@v3 - name: Set up Python 3.11 - uses: actions/setup-python@v4.6.1 + uses: actions/setup-python@v4 with: - python-version-file: 'pyproject.toml' + python-version-file: pyproject.toml architecture: x64 - name: Install dependencies 
(and project) From dae158fce6dcc0155d99585edb398224bd5985ba Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 14:36:41 +0200 Subject: [PATCH 23/45] Add missing 'check' dependency to CI --- .github/workflows/qa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml index ec036c0..07978d3 100644 --- a/.github/workflows/qa.yml +++ b/.github/workflows/qa.yml @@ -19,7 +19,7 @@ jobs: - name: Install dependencies (and project) run: | pip install -U pip - pip install -e .[lint,scripts,test] + pip install -e .[lint,check,scripts,test] - name: Check black formatting run: inv lint-black From 7c83e231c67fe9d788887e10ae9ebffaaabd81a1 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 14:37:52 +0200 Subject: [PATCH 24/45] Fix missleading names in CI --- .github/workflows/publish.yml | 2 +- .github/workflows/qa.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 432ffc8..0f96987 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -13,7 +13,7 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python 3.11 + - name: Set up Python uses: actions/setup-python@v4 with: python-version-file: pyproject.toml diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml index 07978d3..54c93eb 100644 --- a/.github/workflows/qa.yml +++ b/.github/workflows/qa.yml @@ -10,7 +10,7 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python 3.11 + - name: Set up Python uses: actions/setup-python@v4 with: python-version-file: pyproject.toml From 6552dba1e5bc0ac92b34403ba1abab4ee13f7737 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 14:40:32 +0200 Subject: [PATCH 25/45] Fix publishing CI --- .github/workflows/publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 
0f96987..7fdc962 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,8 +21,8 @@ jobs: - name: Build packages run: | - pip install -U pip build - python -m build sdist wheel + pip install -U pip hatch + HATCH_BUILD_HOOKS_ENABLE=true hatch build - name: Upload to PyPI uses: pypa/gh-action-pypi-publish@release/v1.8 From 2d618509e0a28cab461ecda24283c6d87badd977 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 15:01:09 +0200 Subject: [PATCH 26/45] Fix favicon / illustrations handling to not downscale then upscale --- kolibri2zim/scraper.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index af2b073..54bc892 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -797,7 +797,7 @@ def run(self): Creator=self.author, Publisher=self.publisher, Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"), - Illustration_48x48_at_1=self.favicon_fpath.read_bytes(), + Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(), ) self.creator.start() @@ -957,27 +957,23 @@ def retrieve_favicon(self): ) # convert to PNG (might already be PNG but it's OK) - favicon_fpath = favicon_orig.with_suffix(".png") - convert_image(favicon_orig, favicon_fpath) + self.favicon_48_fpath = favicon_orig.with_suffix(".48.png") + convert_image(favicon_orig, self.favicon_48_fpath) - # resize to appropriate size (ZIM uses 48x48 so we double for retina) - for size in (96, 48): - resize_image(favicon_fpath, width=size, height=size, method="thumbnail") - with open(favicon_fpath, "rb") as fh: - self.creator.add_illustration(size, fh.read()) + self.favicon_96_fpath = favicon_orig.with_suffix(".96.png") + convert_image(favicon_orig, self.favicon_96_fpath) - # resize to appropriate size (ZIM uses 48x48) - resize_image(favicon_fpath, width=96, height=96, method="thumbnail") + # resize to appropriate size (ZIM uses 48x48 so we double for retina) + 
resize_image(self.favicon_48_fpath, width=48, height=48, method="thumbnail") + resize_image(self.favicon_96_fpath, width=96, height=96, method="thumbnail") # generate favicon - favicon_ico_path = favicon_fpath.with_suffix(".ico") - create_favicon(src=favicon_fpath, dst=favicon_ico_path) - - self.favicon_fpath = favicon_fpath - self.favicon_ico_path = favicon_ico_path + self.favicon_ico_path = favicon_orig.with_suffix(".ico") + create_favicon(src=self.favicon_96_fpath, dst=self.favicon_ico_path) def add_favicon(self): - self.creator.add_item_for("favicon.png", fpath=self.favicon_fpath) + self.creator.add_illustration(96, self.favicon_96_fpath.read_bytes()) + self.creator.add_item_for("favicon.png", fpath=self.favicon_96_fpath) self.creator.add_item_for("favicon.ico", fpath=self.favicon_ico_path) def add_custom_about_and_css(self): From f48532b349506e3956ad98a3d40a7719511c975c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 15:10:02 +0200 Subject: [PATCH 27/45] Truncate description and add truncated long description --- kolibri2zim/scraper.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 54bc892..3037e77 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -18,6 +18,10 @@ from bs4 import BeautifulSoup from kiwixstorage import KiwixStorage from pif import get_public_ip +from zimscraperlib.constants import ( + MAXIMUM_DESCRIPTION_METADATA_LENGTH, + MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, +) from zimscraperlib.filesystem import get_file_mimetype from zimscraperlib.i18n import find_language_names from zimscraperlib.image.convertion import convert_image, create_favicon @@ -793,7 +797,16 @@ def run(self): Name=self.clean_fname, Language="eng", Title=self.title, - Description=self.description, + Description=( + f"{self.description[0:MAXIMUM_DESCRIPTION_METADATA_LENGTH-4]} ..." 
+ if len(self.description) > MAXIMUM_DESCRIPTION_METADATA_LENGTH + else self.description + ), + LongDescription=( + f"{self.description[0:MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH-4]} ..." + if len(self.description) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH + else self.description + ), Creator=self.author, Publisher=self.publisher, Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"), From c28dc1254d60f2a8ed372d840bc9f0486e645d01 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 15:10:41 +0200 Subject: [PATCH 28/45] zimscraperlib now supports datetime + this avoids formating issue --- kolibri2zim/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 3037e77..37bb764 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -809,7 +809,7 @@ def run(self): ), Creator=self.author, Publisher=self.publisher, - Date=datetime.datetime.now(datetime.UTC).strftime("%Y-%d-%m"), + Date=datetime.datetime.now(datetime.UTC), Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(), ) self.creator.start() From d755683781e36862c55ca31a98a9fa53511b2e6e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 15:28:24 +0200 Subject: [PATCH 29/45] Images must be squared --- kolibri2zim/scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 37bb764..6b8d54d 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -977,8 +977,8 @@ def retrieve_favicon(self): convert_image(favicon_orig, self.favicon_96_fpath) # resize to appropriate size (ZIM uses 48x48 so we double for retina) - resize_image(self.favicon_48_fpath, width=48, height=48, method="thumbnail") - resize_image(self.favicon_96_fpath, width=96, height=96, method="thumbnail") + resize_image(self.favicon_48_fpath, width=48, height=48, method="contain") + resize_image(self.favicon_96_fpath, width=96, height=96, method="contain") 
# generate favicon self.favicon_ico_path = favicon_orig.with_suffix(".ico") From 977aaffcbd00623ac8c835f47961039e442b8ce6 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 21 Jul 2023 15:28:51 +0200 Subject: [PATCH 30/45] Fix changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfc2123..adbabde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fix issue with ePub rendering which was outside the iframe +- Description is now limited to expected lenght and long description is set +- Icons and illustrations are squared as expected - Many small fixes (including some bugs) detected by ruff / pyright ### Changed - Migrate to our new Python standard (hatch, ruff, pyright, ...) -- Using zimscraperlib 3.1.0 +- Using zimscraperlib 3.1.1 - Updated image to `python:3.11-bullseye` - Retry video reencoding up to three times - Move inline javascript to dedicated files From 7186e5f44e4d1d2a54db1df15aa002ee0dce5f54 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 13:54:19 +0200 Subject: [PATCH 31/45] Enhance ZIM description handling and add support long description --- kolibri2zim/entrypoint.py | 5 ++++ kolibri2zim/scraper.py | 51 ++++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/kolibri2zim/entrypoint.py b/kolibri2zim/entrypoint.py index 98576e8..856bea2 100755 --- a/kolibri2zim/entrypoint.py +++ b/kolibri2zim/entrypoint.py @@ -47,6 +47,11 @@ def main(): help="Custom description for your ZIM. Kolibri channel description otherwise", ) + parser.add_argument( + "--long-description", + help="Custom long description for your ZIM, optional", + ) + parser.add_argument( "--favicon", help="URL/path for Favicon. 
Kolibri channel thumbnail otherwise " diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py index 6b8d54d..b1fb7fa 100644 --- a/kolibri2zim/scraper.py +++ b/kolibri2zim/scraper.py @@ -19,8 +19,10 @@ from kiwixstorage import KiwixStorage from pif import get_public_ip from zimscraperlib.constants import ( - MAXIMUM_DESCRIPTION_METADATA_LENGTH, - MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, + MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, +) +from zimscraperlib.constants import ( + MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, ) from zimscraperlib.filesystem import get_file_mimetype from zimscraperlib.i18n import find_language_names @@ -50,6 +52,7 @@ "fname", "title", "description", + "long_description", "creator", "publisher", "tags", @@ -113,6 +116,7 @@ def go(option): self.tags = [t.strip() for t in tags.split(",")] self.title = go("title") self.description = go("description") + self.long_description = go("long_description") self.author = go("creator") self.publisher = go("publisher") self.name = go("name") @@ -797,16 +801,8 @@ def run(self): Name=self.clean_fname, Language="eng", Title=self.title, - Description=( - f"{self.description[0:MAXIMUM_DESCRIPTION_METADATA_LENGTH-4]} ..." - if len(self.description) > MAXIMUM_DESCRIPTION_METADATA_LENGTH - else self.description - ), - LongDescription=( - f"{self.description[0:MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH-4]} ..." 
- if len(self.description) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH - else self.description - ), + Description=self.description, + LongDescription=self.long_description, Creator=self.author, Publisher=self.publisher, Date=datetime.datetime.now(datetime.UTC), @@ -930,8 +926,35 @@ def sanitize_inputs(self): self.title = self.title.strip() if not self.description: - self.description = channel_meta["description"] - self.description = self.description.strip() + # User did not provided a description, we will infer it from channel + # metadata, limited to maximum length + if self.long_description: + raise ValueError( + "long_description cannot be set if description is not set" + ) + self.description = channel_meta["description"].strip() + if len(self.description) > MAX_DESC_LENGTH: + self.long_description = self.description + self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…" + if len(self.long_description > MAX_LONG_DESC_LENGTH): + self.long_description = ( + f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…" + ) + else: + self.description = self.description.strip() + if len(self.description) > MAX_DESC_LENGTH: + raise ValueError( + f"description is too long ({len(self.description)}" + f">{MAX_DESC_LENGTH})" + ) + if ( + self.long_description + and len(self.long_description) > MAX_LONG_DESC_LENGTH + ): + raise ValueError( + f"long_description is too long ({len(self.long_description)}" + f">{MAX_LONG_DESC_LENGTH})" + ) if not self.author: self.author = channel_meta["author"] or "Kolibri" From 2000d7e6c86e0159ad2b7410ed64f847a3df78ea Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 15:24:04 +0200 Subject: [PATCH 32/45] Add tests for description/long-description computations (including folder layout changes to accomodate for tests) --- .github/workflows/Tests.yml | 38 ++++ .gitignore | 20 +- get_js_deps.sh | 2 +- .../templates/assets/perseus_exercise.js | 1 - pyproject.toml | 49 ++++- {kolibri2zim => src/kolibri2zim}/__about__.py | 0 
{kolibri2zim => src/kolibri2zim}/__init__.py | 0 {kolibri2zim => src/kolibri2zim}/__main__.py | 0 {kolibri2zim => src/kolibri2zim}/constants.py | 0 {kolibri2zim => src/kolibri2zim}/database.py | 0 {kolibri2zim => src/kolibri2zim}/debug.py | 0 .../kolibri2zim}/entrypoint.py | 0 {kolibri2zim => src/kolibri2zim}/nodes.py | 0 .../kolibri2zim}/processing.py | 0 {kolibri2zim => src/kolibri2zim}/scraper.py | 2 +- .../kolibri2zim}/templates/about.html | 0 .../kolibri2zim}/templates/assets/document.js | 2 +- .../templates/assets/epub_embed.css | 2 +- .../templates/assets/epub_embed.html | 0 .../templates/assets/epub_embed.js | 2 +- .../templates/assets/perseus_exercise.js | 1 + .../kolibri2zim}/templates/audio.html | 0 .../kolibri2zim}/templates/base.html | 0 .../kolibri2zim}/templates/card.html | 0 .../kolibri2zim}/templates/document.html | 2 +- .../kolibri2zim}/templates/epub.html | 2 - .../kolibri2zim}/templates/kolibri-logo.png | Bin .../kolibri2zim}/templates/node_meta.html | 0 .../templates/perseus_exercise.html | 0 .../kolibri2zim}/templates/topic.html | 0 .../kolibri2zim}/templates/video.html | 0 tasks.py | 26 +++ tests/conftest.py | 60 ++++++ tests/test_sanitize_inputs.py | 193 ++++++++++++++++++ 34 files changed, 379 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/Tests.yml delete mode 100644 kolibri2zim/templates/assets/perseus_exercise.js rename {kolibri2zim => src/kolibri2zim}/__about__.py (100%) rename {kolibri2zim => src/kolibri2zim}/__init__.py (100%) rename {kolibri2zim => src/kolibri2zim}/__main__.py (100%) rename {kolibri2zim => src/kolibri2zim}/constants.py (100%) rename {kolibri2zim => src/kolibri2zim}/database.py (100%) rename {kolibri2zim => src/kolibri2zim}/debug.py (100%) rename {kolibri2zim => src/kolibri2zim}/entrypoint.py (100%) rename {kolibri2zim => src/kolibri2zim}/nodes.py (100%) rename {kolibri2zim => src/kolibri2zim}/processing.py (100%) rename {kolibri2zim => src/kolibri2zim}/scraper.py (99%) rename {kolibri2zim => 
src/kolibri2zim}/templates/about.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/assets/document.js (94%) rename {kolibri2zim => src/kolibri2zim}/templates/assets/epub_embed.css (99%) rename {kolibri2zim => src/kolibri2zim}/templates/assets/epub_embed.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/assets/epub_embed.js (94%) create mode 100644 src/kolibri2zim/templates/assets/perseus_exercise.js rename {kolibri2zim => src/kolibri2zim}/templates/audio.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/base.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/card.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/document.html (99%) rename {kolibri2zim => src/kolibri2zim}/templates/epub.html (99%) rename {kolibri2zim => src/kolibri2zim}/templates/kolibri-logo.png (100%) rename {kolibri2zim => src/kolibri2zim}/templates/node_meta.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/perseus_exercise.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/topic.html (100%) rename {kolibri2zim => src/kolibri2zim}/templates/video.html (100%) create mode 100644 tests/conftest.py create mode 100644 tests/test_sanitize_inputs.py diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml new file mode 100644 index 0000000..53ace14 --- /dev/null +++ b/.github/workflows/Tests.yml @@ -0,0 +1,38 @@ +name: Tests + +on: + pull_request: + push: + branches: + - main + +jobs: + run-tests: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[test,scripts] + + - name: Run the tests + run: inv coverage --args "-vvv" + + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Ensure we can build 
targets + run: | + pip install build + python3 -m build diff --git a/.gitignore b/.gitignore index d4c96ac..db971bd 100644 --- a/.gitignore +++ b/.gitignore @@ -367,17 +367,17 @@ pyrightconfig.json # assets that we download -kolibri2zim/templates/assets/bootstrap/ -kolibri2zim/templates/assets/pdfjs/ -kolibri2zim/templates/assets/videojs/ -kolibri2zim/templates/assets/jquery.min.js -kolibri2zim/templates/assets/ogvjs/ -kolibri2zim/templates/assets/videojs-ogvjs.js .dockerignore -kolibri2zim/templates/assets/epub.min.js -kolibri2zim/templates/assets/bootstrap-icons/ -kolibri2zim/templates/assets/jszip.min.js -kolibri2zim/templates/assets/perseus/ +src/kolibri2zim/templates/assets/bootstrap/ +src/kolibri2zim/templates/assets/pdfjs/ +src/kolibri2zim/templates/assets/videojs/ +src/kolibri2zim/templates/assets/jquery.min.js +src/kolibri2zim/templates/assets/ogvjs/ +src/kolibri2zim/templates/assets/videojs-ogvjs.js +src/kolibri2zim/templates/assets/epub.min.js +src/kolibri2zim/templates/assets/bootstrap-icons/ +src/kolibri2zim/templates/assets/jszip.min.js +src/kolibri2zim/templates/assets/perseus/ # output dir output diff --git a/get_js_deps.sh b/get_js_deps.sh index 9936269..8954e5b 100755 --- a/get_js_deps.sh +++ b/get_js_deps.sh @@ -23,7 +23,7 @@ fi # Absolute path this script is in. 
SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )" -ASSETS_PATH="${SCRIPT_PATH}/kolibri2zim/templates/assets" +ASSETS_PATH="${SCRIPT_PATH}/src/kolibri2zim/templates/assets" echo "About to download JS assets to ${ASSETS_PATH}" diff --git a/kolibri2zim/templates/assets/perseus_exercise.js b/kolibri2zim/templates/assets/perseus_exercise.js deleted file mode 100644 index dfbc7a7..0000000 --- a/kolibri2zim/templates/assets/perseus_exercise.js +++ /dev/null @@ -1 +0,0 @@ -less = { env: 'development', logLevel: 1 }; \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ebb50c0..6bd8c4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,10 +29,15 @@ dynamic = ["version"] scripts = ["invoke==2.1.3"] lint = ["black==23.3.0", "ruff==0.0.272"] check = ["pyright==1.1.317"] +test = [ + "pytest==7.4.0", + "coverage==7.2.7", +] dev = [ "debugpy", "kolibri2zim[scripts]", "kolibri2zim[lint]", + "kolibri2zim[test]", "kolibri2zim[check]", "hatchling", ] @@ -45,19 +50,27 @@ Donate = "https://www.kiwix.org/en/support-us/" kolibri2zim = "kolibri2zim:entrypoint.main" [tool.hatch.version] -path = "kolibri2zim/__about__.py" +path = "src/kolibri2zim/__about__.py" [tool.hatch.build] exclude = ["/.github"] [tool.hatch.build.hooks.custom] -enable-by-default = false path = "hatch_build.py" dependencies = ["zimscraperlib==3.1.1"] [tool.hatch.envs.default] features = ["dev"] +[tool.hatch.envs.test] +features = ["scripts", "test"] + +[tool.hatch.envs.test.scripts] +run = "inv test --args '{args}'" +run-cov = "inv test-cov --args '{args}'" +report-cov = "inv report-cov" +coverage = "inv coverage --args '{args}'" + [tool.hatch.envs.lint] template = "lint" python = "py311" @@ -176,13 +189,41 @@ ban-relative-imports = "all" # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] +[tool.pytest.ini_options] +minversion = "7.3" +testpaths = ["tests"] +pythonpath = [".", "src"] + +[tool.coverage.paths] +great_project = 
["src/kolibri2zim"] +tests = ["tests"] + +[tool.coverage.run] +source_pkgs = ["kolibri2zim"] +branch = true +parallel = true +omit = [ + "src/kolibri2zim/__about__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + [tool.pyright] pythonVersion = "3.11" pythonPlatform = "All" typeCheckingMode = "basic" -include = ["kolibri2zim"] +include = ["src", "tests", "tasks.py"] exclude = ["**/node_modules", "**/__pycache__", - "kolibri2zim/templates", + "src/kolibri2zim/templates", +] + +executionEnvironments= [ + { root= "src" } ] diff --git a/kolibri2zim/__about__.py b/src/kolibri2zim/__about__.py similarity index 100% rename from kolibri2zim/__about__.py rename to src/kolibri2zim/__about__.py diff --git a/kolibri2zim/__init__.py b/src/kolibri2zim/__init__.py similarity index 100% rename from kolibri2zim/__init__.py rename to src/kolibri2zim/__init__.py diff --git a/kolibri2zim/__main__.py b/src/kolibri2zim/__main__.py similarity index 100% rename from kolibri2zim/__main__.py rename to src/kolibri2zim/__main__.py diff --git a/kolibri2zim/constants.py b/src/kolibri2zim/constants.py similarity index 100% rename from kolibri2zim/constants.py rename to src/kolibri2zim/constants.py diff --git a/kolibri2zim/database.py b/src/kolibri2zim/database.py similarity index 100% rename from kolibri2zim/database.py rename to src/kolibri2zim/database.py diff --git a/kolibri2zim/debug.py b/src/kolibri2zim/debug.py similarity index 100% rename from kolibri2zim/debug.py rename to src/kolibri2zim/debug.py diff --git a/kolibri2zim/entrypoint.py b/src/kolibri2zim/entrypoint.py similarity index 100% rename from kolibri2zim/entrypoint.py rename to src/kolibri2zim/entrypoint.py diff --git a/kolibri2zim/nodes.py b/src/kolibri2zim/nodes.py similarity index 100% rename from kolibri2zim/nodes.py rename to src/kolibri2zim/nodes.py diff --git a/kolibri2zim/processing.py b/src/kolibri2zim/processing.py similarity index 100% rename 
from kolibri2zim/processing.py rename to src/kolibri2zim/processing.py diff --git a/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py similarity index 99% rename from kolibri2zim/scraper.py rename to src/kolibri2zim/scraper.py index b1fb7fa..1bf0fcb 100644 --- a/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -936,7 +936,7 @@ def sanitize_inputs(self): if len(self.description) > MAX_DESC_LENGTH: self.long_description = self.description self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…" - if len(self.long_description > MAX_LONG_DESC_LENGTH): + if len(self.long_description) > MAX_LONG_DESC_LENGTH: self.long_description = ( f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…" ) diff --git a/kolibri2zim/templates/about.html b/src/kolibri2zim/templates/about.html similarity index 100% rename from kolibri2zim/templates/about.html rename to src/kolibri2zim/templates/about.html diff --git a/kolibri2zim/templates/assets/document.js b/src/kolibri2zim/templates/assets/document.js similarity index 94% rename from kolibri2zim/templates/assets/document.js rename to src/kolibri2zim/templates/assets/document.js index a0826a0..b085bf6 100644 --- a/kolibri2zim/templates/assets/document.js +++ b/src/kolibri2zim/templates/assets/document.js @@ -6,4 +6,4 @@ function resizeFrameToFullHeight(){ frame.style.height = newHeight + 'px'; } window.addEventListener('resize', resizeFrameToFullHeight, {capture: true}); -resizeFrameToFullHeight(); \ No newline at end of file +resizeFrameToFullHeight(); diff --git a/kolibri2zim/templates/assets/epub_embed.css b/src/kolibri2zim/templates/assets/epub_embed.css similarity index 99% rename from kolibri2zim/templates/assets/epub_embed.css rename to src/kolibri2zim/templates/assets/epub_embed.css index 25aed6d..2a5e859 100644 --- a/kolibri2zim/templates/assets/epub_embed.css +++ b/src/kolibri2zim/templates/assets/epub_embed.css @@ -280,4 +280,4 @@ svg { #opener:hover { stroke: #777; fill: #777; -} \ No newline at end of file +} 
diff --git a/kolibri2zim/templates/assets/epub_embed.html b/src/kolibri2zim/templates/assets/epub_embed.html similarity index 100% rename from kolibri2zim/templates/assets/epub_embed.html rename to src/kolibri2zim/templates/assets/epub_embed.html diff --git a/kolibri2zim/templates/assets/epub_embed.js b/src/kolibri2zim/templates/assets/epub_embed.js similarity index 94% rename from kolibri2zim/templates/assets/epub_embed.js rename to src/kolibri2zim/templates/assets/epub_embed.js index 6e6c0fc..fd269c5 100644 --- a/kolibri2zim/templates/assets/epub_embed.js +++ b/src/kolibri2zim/templates/assets/epub_embed.js @@ -48,4 +48,4 @@ var params = URLSearchParams && new URLSearchParams(document.location.search.sub }; rendition.on("keyup", keyListener); - document.addEventListener("keyup", keyListener, false); \ No newline at end of file + document.addEventListener("keyup", keyListener, false); diff --git a/src/kolibri2zim/templates/assets/perseus_exercise.js b/src/kolibri2zim/templates/assets/perseus_exercise.js new file mode 100644 index 0000000..bce3f89 --- /dev/null +++ b/src/kolibri2zim/templates/assets/perseus_exercise.js @@ -0,0 +1 @@ +less = { env: 'development', logLevel: 1 }; diff --git a/kolibri2zim/templates/audio.html b/src/kolibri2zim/templates/audio.html similarity index 100% rename from kolibri2zim/templates/audio.html rename to src/kolibri2zim/templates/audio.html diff --git a/kolibri2zim/templates/base.html b/src/kolibri2zim/templates/base.html similarity index 100% rename from kolibri2zim/templates/base.html rename to src/kolibri2zim/templates/base.html diff --git a/kolibri2zim/templates/card.html b/src/kolibri2zim/templates/card.html similarity index 100% rename from kolibri2zim/templates/card.html rename to src/kolibri2zim/templates/card.html diff --git a/kolibri2zim/templates/document.html b/src/kolibri2zim/templates/document.html similarity index 99% rename from kolibri2zim/templates/document.html rename to src/kolibri2zim/templates/document.html 
index eec2b56..18dba86 100644 --- a/kolibri2zim/templates/document.html +++ b/src/kolibri2zim/templates/document.html @@ -49,7 +49,7 @@ {% endblock %} diff --git a/kolibri2zim/templates/epub.html b/src/kolibri2zim/templates/epub.html similarity index 99% rename from kolibri2zim/templates/epub.html rename to src/kolibri2zim/templates/epub.html index 306ca09..00f949c 100644 --- a/kolibri2zim/templates/epub.html +++ b/src/kolibri2zim/templates/epub.html @@ -8,5 +8,3 @@

You should get an epub reader here someday. In the mean time, just Open EPUB directly

- - diff --git a/kolibri2zim/templates/kolibri-logo.png b/src/kolibri2zim/templates/kolibri-logo.png similarity index 100% rename from kolibri2zim/templates/kolibri-logo.png rename to src/kolibri2zim/templates/kolibri-logo.png diff --git a/kolibri2zim/templates/node_meta.html b/src/kolibri2zim/templates/node_meta.html similarity index 100% rename from kolibri2zim/templates/node_meta.html rename to src/kolibri2zim/templates/node_meta.html diff --git a/kolibri2zim/templates/perseus_exercise.html b/src/kolibri2zim/templates/perseus_exercise.html similarity index 100% rename from kolibri2zim/templates/perseus_exercise.html rename to src/kolibri2zim/templates/perseus_exercise.html diff --git a/kolibri2zim/templates/topic.html b/src/kolibri2zim/templates/topic.html similarity index 100% rename from kolibri2zim/templates/topic.html rename to src/kolibri2zim/templates/topic.html diff --git a/kolibri2zim/templates/video.html b/src/kolibri2zim/templates/video.html similarity index 100% rename from kolibri2zim/templates/video.html rename to src/kolibri2zim/templates/video.html diff --git a/tasks.py b/tasks.py index 424223b..3370b73 100644 --- a/tasks.py +++ b/tasks.py @@ -7,6 +7,32 @@ use_pty = not os.getenv("CI", "") +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test(ctx: Context, args: str | None = ""): + """run tests (without coverage)""" + ctx.run(f"pytest {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test_cov(ctx: Context, args: str | None = ""): + """run test vith coverage""" + ctx.run(f"coverage run -m pytest {args}", pty=use_pty) + + +@task() +def report_cov(ctx: Context): + """report coverage""" + ctx.run("coverage combine", warn=True, pty=use_pty) + ctx.run("coverage report --show-missing", pty=use_pty) + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def coverage(ctx: Context, args: str | None = ""): + """run tests and report coverage""" + 
test_cov(ctx, args) + report_cov(ctx) + + @task( optional=["args"], help={"args": "linting tools (black, ruff) additional arguments"} ) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0210377 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,60 @@ +from collections.abc import Callable, Generator +from typing import Any + +import pytest + +from kolibri2zim.scraper import Kolibri2Zim, KolibriDB +from kolibri2zim.scraper import options as expected_options_keys + +CHANNEL_NAME = "channel_name" +CHANNEL_DESCRIPTION = "a description" + + +class FakeDb(KolibriDB): + def __init__( + self, + channel_name: str, + channel_description: str, + channel_author: str | None, + ): + self.channel_name = channel_name + self.channel_description = channel_description + self.channel_author = channel_author + + def get_channel_metadata(self, _): + return { + "name": self.channel_name, + "description": self.channel_description, + "author": self.channel_author, + } + + +@pytest.fixture() +def scraper_generator() -> Generator[Callable[..., Kolibri2Zim], None, None]: + def _scraper( + channel_name: str = CHANNEL_NAME, + channel_description: str = CHANNEL_DESCRIPTION, + channel_author: str | None = None, + additional_options: dict[str, Any] = {}, + ) -> Kolibri2Zim: + options = {} + for option_key in expected_options_keys: + options[option_key] = None + options.update(additional_options) + scraper = Kolibri2Zim(**options) + scraper.db = FakeDb( + channel_author=channel_author, + channel_description=channel_description, + channel_name=channel_name, + ) + return scraper + + yield _scraper + + +# @pytest.fixture +# def default_options() -> Generator[dict[str, Any], None, None]: +# default_options = {} +# for option in options: +# default_options[option] = None +# yield default_options diff --git a/tests/test_sanitize_inputs.py b/tests/test_sanitize_inputs.py new file mode 100644 index 0000000..1504c12 --- /dev/null +++ b/tests/test_sanitize_inputs.py @@ -0,0 
+1,193 @@ +import random +import string +from collections.abc import Callable + +import pytest +from zimscraperlib.constants import MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LEN +from zimscraperlib.constants import ( + MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LEN, +) + +from kolibri2zim.scraper import Kolibri2Zim + + +def randomword(length): + letters = string.ascii_lowercase + return "".join(random.choice(letters) for i in range(length)) # noqa: S311 + + +def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]): + scraper = scraper_generator() + scraper.sanitize_inputs() + + +TEXT_NOT_USED = "text not used" + +LONG_TEXT = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor " + "incididunt ut labore et dolore magna aliqua. At erat pellentesque adipiscing " + "commodo elit at imperdiet. Rutrum tellus pellentesque eu tincidunt tortor aliquam" + " nulla facilisi. Eget lorem dolor sed viverra ipsum nunc. Ipsum nunc aliquet " + "bibendum enim facilisis gravida neque convallis. Aliquam malesuada bibendum arcu " + "vitae elementum curabitur. Platea dictumst quisque sagittis purus sit amet " + "volutpat. Blandit libero volutpat sed cras ornare. In eu mi bibendum neque " + "egestas. Egestas dui id ornare arcu odio. Pulvinar neque laoreet suspendisse " + "interdum. Fames ac turpis egestas integer eget aliquet nibh praesent tristique. Et" + " egestas quis ipsum suspendisse ultrices gravida dictum fusce. Malesuada fames ac " + "turpis egestas. Tincidunt nunc pulvinar sapien et ligula ullamcorper malesuada " + "proin libero. In arcu cursus euismod quis viverra. Faucibus in ornare quam viverra" + ". Curabitur vitae nunc sed velit dignissim sodales ut eu sem. Velit scelerisque in" + " dictum non consectetur a erat nam. Proin fermentum leo vel orci porta non. Fames" + " ac turpis egestas sed tempus. Vitae justo eget magna fermentum iaculis eu non. 
" + "Imperdiet massa tincidunt nunc pulvinar sapien et ligula. Laoreet sit amet cursus " + "sit amet dictum sit amet. Quis hendrerit dolor magna eget. Orci ac auctor augue " + "mauris augue. Consequat interdum varius sit amet mattis. At ultrices mi tempus " + "imperdiet nulla malesuada pellentesque elit. Volutpat est velit egestas dui. " + "Potenti nullam ac tortor vitae. At tempor commodo ullamcorper a lacus vestibulum " + "sed arcu non. Duis ut diam quam nulla. Vestibulum mattis ullamcorper velit sed " + "ullamcorper. Sit amet commodo nulla facilisi nullam vehicula. Faucibus purus in " + "massa tempor nec feugiat. Sem fringilla ut morbi tincidunt augue interdum velit. " + "Etiam dignissim diam quis enim lobortis scelerisque fermentum dui. Nunc vel risus " + "commodo viverra maecenas accumsan. Aenean sed adipiscing diam donec adipiscing " + "tristique. Maecenas accumsan lacus vel facilisis volutpat est velit egestas. Nulla" + " aliquet porttitor lacus luctus accumsan tortor posuere ac. Habitant morbi " + "tristique senectus et netus et. Eget mi proin sed libero enim sed faucibus turpis " + "in. Vulputate enim nulla aliquet porttitor lacus. Dui ut ornare lectus sit amet " + "est. Quam lacus suspendisse faucibus interdum posuere. Sagittis orci a scelerisque" + " purus semper eget duis at tellus. Tellus molestie nunc non blandit massa. Feugiat" + " vivamus at augue eget arcu dictum varius duis at. Varius morbi enim nunc faucibus" + " a pellentesque sit. Id aliquet lectus proin nibh nisl condimentum id venenatis a." + " Tortor dignissim convallis aenean et tortor at risus viverra adipiscing. Aliquam " + "malesuada bibendum arcu vitae elementum curabitur vitae nunc sed. Habitasse platea" + " dictumst quisque sagittis purus sit amet volutpat. Vitae auctor eu augue ut " + "lectus. At varius vel pharetra vel turpis nunc eget. Dictum at tempor commodo " + "ullamcorper a lacus vestibulum sed arcu. Pellentesque massa placerat duis " + "ultricies. 
Enim nunc faucibus a pellentesque sit amet porttitor eget dolor. " + "Volutpat blandit aliquam etiam erat velit scelerisque in. Amet mattis vulputate " + "enim nulla aliquet porttitor. Egestas maecenas pharetra convallis posuere morbi " + "leo urna molestie. Duis ut diam quam nulla porttitor massa id. In fermentum " + "posuere urna nec tincidunt praesent. Turpis egestas sed tempus urna et pharetra " + "pharetra massa. Tellus molestie nunc non blandit massa. Diam phasellus vestibulum " + "lorem sed risus ultricies. Egestas erat imperdiet sed euismod nisi porta lorem. " + "Quam viverra orci sagittis eu volutpat odio facilisis mauris sit. Ornare aenean " + "euismod elementum nisi quis. Laoreet non curabitur gravida arcu ac tortor " + "dignissim convallis aenean. Sagittis aliquam malesuada bibendum arcu vitae " + "elementum. Sed blandit libero volutpat sed cras ornare. Sagittis eu volutpat odio " + "facilisis mauris. Facilisis volutpat est velit egestas dui id ornare arcu odio. " + "Eu feugiat pretium nibh." 
+) + + +@pytest.mark.parametrize( + "cli_description, cli_long_description, channel_description, raises, " + "expected_description, expected_long_description", + [ + # CLI description set and is short, CLI long descripion not set, channel + # description doe not matter + ( + LONG_TEXT[0:MAX_DESC_LEN], + None, + TEXT_NOT_USED, + False, + LONG_TEXT[0:MAX_DESC_LEN], + None, + ), + # CLI description set and is too long, channel description doe not matter + (LONG_TEXT[0 : MAX_DESC_LEN + 1], None, TEXT_NOT_USED, True, None, None), + # CLI description not set and channel description is short enough + (None, None, LONG_TEXT[0:MAX_DESC_LEN], False, LONG_TEXT[0:MAX_DESC_LEN], None), + # CLI description not set and channel description is too long for description + # but ok for long description + ( + None, + None, + LONG_TEXT[0 : MAX_DESC_LEN + 1], + False, + LONG_TEXT[0 : MAX_DESC_LEN - 1] + "…", + LONG_TEXT[0 : MAX_DESC_LEN + 1], + ), + ( + None, + None, + LONG_TEXT[0:MAX_LONG_DESC_LEN], + False, + LONG_TEXT[0 : MAX_DESC_LEN - 1] + "…", + LONG_TEXT[0:MAX_LONG_DESC_LEN], + ), + # CLI description not set and channel description is too long for description + # and long description + ( + None, + None, + LONG_TEXT[0 : MAX_LONG_DESC_LEN + 1], + False, + LONG_TEXT[0 : MAX_DESC_LEN - 1] + "…", + LONG_TEXT[0 : MAX_LONG_DESC_LEN - 1] + "…", + ), + # CLI description set and is short, CLI long descripion set and is short, + # channel description does not matter + ( + LONG_TEXT[0:MAX_DESC_LEN], + LONG_TEXT[0:MAX_LONG_DESC_LEN], + TEXT_NOT_USED, + False, + LONG_TEXT[0:MAX_DESC_LEN], + LONG_TEXT[0:MAX_LONG_DESC_LEN], + ), + # CLI description set and is short, CLI long descripion set and is too long, + # channel description does not matter + ( + LONG_TEXT[0:MAX_DESC_LEN], + LONG_TEXT[0 : MAX_LONG_DESC_LEN + 1], + TEXT_NOT_USED, + True, + None, + None, + ), + # CLI description not set, CLI long descripion set and is short, + # channel description does not matter + ( + None, + 
LONG_TEXT[0:MAX_LONG_DESC_LEN], + TEXT_NOT_USED, + True, + None, + None, + ), + ], +) +def test_description( + cli_description: str, + cli_long_description: str, + channel_description: str, + *, + raises: bool, + expected_description: str, + expected_long_description: str, + scraper_generator: Callable[..., Kolibri2Zim], +): + if channel_description: + scraper = scraper_generator( + channel_description=channel_description, + additional_options={ + "description": cli_description, + "long_description": cli_long_description, + }, + ) + else: + scraper = scraper_generator( + additional_options={ + "description": cli_description, + "long_description": cli_long_description, + } + ) + + if raises: + with pytest.raises(ValueError): + scraper.sanitize_inputs() + return + else: + scraper.sanitize_inputs() + + assert scraper.description == expected_description + assert scraper.long_description == expected_long_description From 33cab630979c0f6f325e1441f690c61dc78f1e05 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 17:10:39 +0200 Subject: [PATCH 33/45] Fix wrong method for items --- src/kolibri2zim/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py index 1bf0fcb..d96cde6 100644 --- a/src/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -282,7 +282,7 @@ def funnel_from_s3(self, file_id, path, checksum, preset): "fileobj": fileobj, "mimetype": preset.mimetype, } - self.creator.add_item_for(StaticItem(**kwargs)) + self.creator.add_item(StaticItem(**kwargs)) logger.debug(f"Added {path} from S3::{key}") return True From f69c68a9f4300de571bb0972dc10c082ddda6838 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 17:11:23 +0200 Subject: [PATCH 34/45] Move code back to its original position, should not have been moved --- src/kolibri2zim/scraper.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git 
a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py index d96cde6..7415e53 100644 --- a/src/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -753,6 +753,25 @@ def add_html5_node(self, node_id): logger.debug(f"Added HTML5 node #{node_id}") def run(self): + if self.s3_url_with_credentials and not self.s3_credentials_ok(): + raise ValueError("Unable to connect to Optimization Cache. Check its URL.") + + s3_msg = ( + f" using cache: {self.s3_storage.url.netloc} " + f"with bucket: {self.s3_storage.bucket_name}" + if self.s3_storage + else "" + ) + logger.info( + f"Starting scraper with:\n" + f" channel_id: {self.channel_id}\n" + f" build_dir: {self.build_dir}\n" + f" output_dir: {self.output_dir}\n" + f" using webm : {self.use_webm}\n" + f" low_quality : {self.low_quality}\n" + f"{s3_msg}" + ) + self.ensure_js_deps_are_present() logger.info("Download database") @@ -1093,22 +1112,3 @@ def ensure_js_deps_are_present(self): "It looks like JS deps have not been installed," f" {js_deps_dir} is missing" ) - - if self.s3_url_with_credentials and not self.s3_credentials_ok(): - raise ValueError("Unable to connect to Optimization Cache. 
Check its URL.") - - s3_msg = ( - f" using cache: {self.s3_storage.url.netloc} " - f"with bucket: {self.s3_storage.bucket_name}" - if self.s3_storage - else "" - ) - logger.info( - f"Starting scraper with:\n" - f" channel_id: {self.channel_id}\n" - f" build_dir: {self.build_dir}\n" - f" output_dir: {self.output_dir}\n" - f" using webm : {self.use_webm}\n" - f" low_quality : {self.low_quality}\n" - f"{s3_msg}" - ) From f6ede9f9e4dbd0d0b037615d14652e164a293f85 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 17:11:58 +0200 Subject: [PATCH 35/45] Simplify ensure_js_deps_are_present --- src/kolibri2zim/constants.py | 14 ++++++++++++++ src/kolibri2zim/scraper.py | 30 ++++-------------------------- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/src/kolibri2zim/constants.py b/src/kolibri2zim/constants.py index f8a27fb..53e0aa5 100644 --- a/src/kolibri2zim/constants.py +++ b/src/kolibri2zim/constants.py @@ -20,6 +20,20 @@ STUDIO_DEFAULT_BASE_URL = "https://studio.learningequality.org" STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL) +# when modifiying this list, update list in hatch_build.py as well +JS_DEPS: list[str] = [ + "pdfjs", + "videojs", + "ogvjs", + "bootstrap", + "bootstrap-icons", + "perseus", + "epub.min.js", + "jszip.min.js", + "jquery.min.js", + "videojs-ogvjs.js", +] + def is_running_inside_container(): fpath = pathlib.Path("/proc/self/cgroup") diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py index 7415e53..c72d7ea 100644 --- a/src/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -7,7 +7,6 @@ import hashlib import io import json -import os import shutil import tempfile import threading @@ -33,7 +32,7 @@ from zimscraperlib.zim.creator import Creator from zimscraperlib.zim.items import StaticItem -from kolibri2zim.constants import ROOT_DIR, STUDIO_URL, get_logger +from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, get_logger from kolibri2zim.database import KolibriDB from 
kolibri2zim.debug import ( ON_DISK_THRESHOLD, @@ -1085,30 +1084,9 @@ def add_custom_about_and_css(self): logger.debug("Added about page and custom CSS") def ensure_js_deps_are_present(self): - for js_deps_file in [ - "epub.min.js", - "jszip.min.js", - "jquery.min.js", - "videojs-ogvjs.js", - ]: - if not os.path.exists( - self.templates_dir.joinpath(f"assets/{js_deps_file}") - ): - raise ValueError( - "It looks like JS deps have not been installed," - f" {js_deps_file} is missing" - ) - - for js_deps_dir in [ - "pdfjs", - "videojs", - "ogvjs", - "bootstrap", - "bootstrap-icons", - "perseus", - ]: - if not os.path.exists(self.templates_dir.joinpath(f"assets/{js_deps_dir}")): + for dep in JS_DEPS: + if not self.templates_dir.joinpath(f"assets/{dep}").exists(): raise ValueError( "It looks like JS deps have not been installed," - f" {js_deps_dir} is missing" + f" {dep} is missing" ) From 0a4082dbec93279f14fa1e85a2d3ac36592eadcf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 17:48:24 +0200 Subject: [PATCH 36/45] Ensure assets are not downloaded twice at build time --- .github/workflows/publish.yml | 2 +- Dockerfile | 2 +- hatch_build.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7fdc962..108c895 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -22,7 +22,7 @@ jobs: - name: Build packages run: | pip install -U pip hatch - HATCH_BUILD_HOOKS_ENABLE=true hatch build + hatch build - name: Upload to PyPI uses: pypa/gh-action-pypi-publish@release/v1.8 diff --git a/Dockerfile b/Dockerfile index 89a7302..e38f96d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN apt-get update -y \ #RUN pip3 install --no-cache-dir -r /src/requirements.txt COPY kolibri2zim /src/kolibri2zim COPY pyproject.toml *.md get_js_deps.sh install.sh MANIFEST.in LICENSE *.py /src/ -RUN cd /src/ && HATCH_BUILD_HOOKS_ENABLE=true 
hatch build -t sdist && ./install.sh +RUN cd /src/ && hatch build -t sdist && ./install.sh # default output directory RUN mkdir -p /output diff --git a/hatch_build.py b/hatch_build.py index d98f9e3..1ecbbe0 100644 --- a/hatch_build.py +++ b/hatch_build.py @@ -4,13 +4,42 @@ from hatchling.builders.hooks.plugin.interface import BuildHookInterface +logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) +# update list in constants.py as well +JS_DEPS = [ + "pdfjs", + "videojs", + "ogvjs", + "bootstrap", + "bootstrap-icons", + "perseus", + "epub.min.js", + "jszip.min.js", + "jquery.min.js", + "videojs-ogvjs.js", +] + class GetJsDepsHook(BuildHookInterface): def initialize(self, version, build_data): + if self.deps_already_installed(): + logger.info("JS dependencies are already installed, skipping it") + return + Path(self.root).joinpath("src/kolibri2zim/templates/assets") subprocess.run( str(Path(self.root).joinpath("get_js_deps.sh")), # noqa : S603 check=True, ) return super().initialize(version, build_data) + + def deps_already_installed(self) -> bool: + for dep in JS_DEPS: + if ( + not Path(self.root) + .joinpath(f"src/kolibri2zim/templates/assets/{dep}") + .exists() + ): + return False + return True From 69200e4e28be4333d96bf5c5aac7117cfec446d5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 18:18:35 +0200 Subject: [PATCH 37/45] Adapt to Python bootstrap changes --- Dockerfile | 29 +++++---- install.sh | 5 -- pyproject.toml | 160 ++++++++++++++++++++++++------------------------- 3 files changed, 94 insertions(+), 100 deletions(-) delete mode 100755 install.sh diff --git a/Dockerfile b/Dockerfile index e38f96d..743bd2b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,23 @@ -FROM python:3.11-bullseye -LABEL org.opencontainers.image.source https://github.com/openzim/kolibri2zim +FROM python:3.11-bookworm +LABEL org.opencontainers.image.source https://github.com/openzim/kolibri # Install necessary packages -RUN apt-get update -y 
\ - && apt-get install -y --no-install-recommends locales-all unzip ffmpeg \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* \ - && python -m pip install -U pip hatch +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + locales-all \ + unzip \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* \ + && python -m pip install --no-cache-dir -U \ + pip -#COPY requirements.txt /src/ -#RUN pip3 install --no-cache-dir -r /src/requirements.txt -COPY kolibri2zim /src/kolibri2zim -COPY pyproject.toml *.md get_js_deps.sh install.sh MANIFEST.in LICENSE *.py /src/ -RUN cd /src/ && hatch build -t sdist && ./install.sh +# Copy code + associated artifacts +COPY src /src/src +COPY pyproject.toml *.md get_js_deps.sh MANIFEST.in LICENSE *.py /src/ + +# Install + cleanup +RUN pip install --no-cache-dir /src \ + && rm -rf /src # default output directory RUN mkdir -p /output diff --git a/install.sh b/install.sh deleted file mode 100755 index 0b6a66b..0000000 --- a/install.sh +++ /dev/null @@ -1,5 +0,0 @@ -search_dir=/src/dist/*.tar.gz -for entry in $search_dir -do - pip install "$entry" -done diff --git a/pyproject.toml b/pyproject.toml index 6bd8c4f..d09a6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,13 @@ test = [ "coverage==7.2.7", ] dev = [ - "debugpy", + "pre-commit==3.3.3", + "debugpy==1.6.7", "kolibri2zim[scripts]", "kolibri2zim[lint]", "kolibri2zim[test]", "kolibri2zim[check]", - "hatchling", + "hatchling==1.18.0", ] [project.urls] @@ -99,84 +100,81 @@ target-version = ['py311'] [tool.ruff] target-version = "py311" line-length = 88 -src = ["kolibri2zim"] +src = ["src"] select = [ - # "A", # flake8-builtins - # "ANN", # flake8-annotations - "ARG", # flake8-unused-arguments - # "ASYNC", # flake8-async - # "B", # flake8-bugbear - # "BLE", # flake8-blind-except - "C4", # flake8-comprehensions - "C90", # mccabe - # "COM", # flake8-commas - # "D", # pydocstyle - # "DJ", # flake8-django - "DTZ", # flake8-datetimez - "E", # pycodestyle (default) - 
"EM", # flake8-errmsg - # "ERA", # eradicate - # "EXE", # flake8-executable - "F", # Pyflakes (default) - # "FA", # flake8-future-annotations - "FBT", # flake8-boolean-trap - # "FLY", # flynt - # "G", # flake8-logging-format - "I", # isort - "ICN", # flake8-import-conventions - # "INP", # flake8-no-pep420 - # "INT", # flake8-gettext - "ISC", # flake8-implicit-str-concat - "N", # pep8-naming - # "NPY", # NumPy-specific rules - # "PD", # pandas-vet - # "PGH", # pygrep-hooks - # "PIE", # flake8-pie - # "PL", # Pylint - "PLC", # Pylint: Convention - "PLE", # Pylint: Error - "PLR", # Pylint: Refactor - "PLW", # Pylint: Warning - # "PT", # flake8-pytest-style - # "PTH", # flake8-use-pathlib - # "PYI", # flake8-pyi - "Q", # flake8-quotes - # "RET", # flake8-return - # "RSE", # flake8-raise - "RUF", # Ruff-specific rules - "S", # flake8-bandit - # "SIM", # flake8-simplify - # "SLF", # flake8-self - "T10", # flake8-debugger - "T20", # flake8-print - # "TCH", # flake8-type-checking - # "TD", # flake8-todos - "TID", # flake8-tidy-imports - # "TRY", # tryceratops - "UP", # pyupgrade - "W", # pycodestyle - "YTT", # flake8-2020 + "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # flake8-no-pep420 + # "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # 
"PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 ] ignore = [ - # Allow non-abstract empty methods in abstract base classes - "B027", - "EM", - # Allow boolean positional values in function calls, like `dict.get(... True)` - "FBT003", - # Ignore checks for possible passwords - "S105", - "S106", - "S107", - # Ignore complexity - "C901", - "PLR0911", - "PLR0912", - "PLR0913", - "PLR0915", + # Allow non-abstract empty methods in abstract base classes + "B027", + # Remove flake8-errmsg since we consider they bloat the code and provide limited value + "EM", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore warnings on subprocess.run / popen + "S603", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", ] unfixable = [ - # Don't touch unused imports - "F401", + # Don't touch unused imports + "F401", ] [tool.ruff.isort] @@ -214,16 +212,12 @@ exclude_lines = [ ] [tool.pyright] -pythonVersion = "3.11" -pythonPlatform = "All" -typeCheckingMode = "basic" - include = ["src", "tests", "tasks.py"] exclude = ["**/node_modules", "**/__pycache__", "src/kolibri2zim/templates", ] - -executionEnvironments= [ - { root= "src" } -] +extraPaths = ["src"] +pythonVersion = "3.11" +pythonPlatform = "All" +typeCheckingMode="basic" From 6896f310d7f1fc9ae92f3562101196ab063bbd83 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 18:31:12 +0200 Subject: [PATCH 38/45] Update pyright to 1.1.318 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d09a6e2..d3a39d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dynamic = ["version"] [project.optional-dependencies] scripts = ["invoke==2.1.3"] lint = ["black==23.3.0", "ruff==0.0.272"] -check = ["pyright==1.1.317"] +check = ["pyright==1.1.318"] test = [ "pytest==7.4.0", "coverage==7.2.7", From aa794c258e7d783a696de04ee533d895cd2a868b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 18:31:38 +0200 Subject: [PATCH 39/45] Fix few quality issues --- dump_channel_to_fs.py | 2 +- hatch_build.py | 2 +- src/kolibri2zim/entrypoint.py | 2 +- tests/conftest.py | 13 +++---------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py index ad32cc0..4c67395 100755 --- a/dump_channel_to_fs.py +++ b/dump_channel_to_fs.py @@ -44,7 +44,7 @@ def download_if_missing(url, fpath, fsize=None, *, force=False): if not skipped: fpath.unlink(missing_ok=True) wget = subprocess.run( - [ # 
noqa: S603 + [ "/usr/bin/env", "wget", "-t", diff --git a/hatch_build.py b/hatch_build.py index 1ecbbe0..314e24a 100644 --- a/hatch_build.py +++ b/hatch_build.py @@ -29,7 +29,7 @@ def initialize(self, version, build_data): return Path(self.root).joinpath("src/kolibri2zim/templates/assets") subprocess.run( - str(Path(self.root).joinpath("get_js_deps.sh")), # noqa : S603 + str(Path(self.root).joinpath("get_js_deps.sh")), # : S603 check=True, ) return super().initialize(version, build_data) diff --git a/src/kolibri2zim/entrypoint.py b/src/kolibri2zim/entrypoint.py index 856bea2..34bc34b 100755 --- a/src/kolibri2zim/entrypoint.py +++ b/src/kolibri2zim/entrypoint.py @@ -206,7 +206,7 @@ def main(): logger.error(f"FAILED. An error occurred: {exc}") if args.debug: logger.exception(exc) - raise SystemExit(1) + raise SystemExit(1) from exc if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index 0210377..af47abf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,12 +35,13 @@ def _scraper( channel_name: str = CHANNEL_NAME, channel_description: str = CHANNEL_DESCRIPTION, channel_author: str | None = None, - additional_options: dict[str, Any] = {}, + additional_options: dict[str, Any] | None = None, ) -> Kolibri2Zim: options = {} for option_key in expected_options_keys: options[option_key] = None - options.update(additional_options) + if additional_options: + options.update(additional_options) scraper = Kolibri2Zim(**options) scraper.db = FakeDb( channel_author=channel_author, @@ -50,11 +51,3 @@ def _scraper( return scraper yield _scraper - - -# @pytest.fixture -# def default_options() -> Generator[dict[str, Any], None, None]: -# default_options = {} -# for option in options: -# default_options[option] = None -# yield default_options From e57b03d183498ed07eacb76647bf02938cc3fd87 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Jul 2023 18:34:02 +0200 Subject: [PATCH 40/45] Make it clear why we need hatchling in dev environment --- 
pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d3a39d4..860390e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dev = [ "kolibri2zim[lint]", "kolibri2zim[test]", "kolibri2zim[check]", + # hatchling is a dev dependency only needed for hook development on developer machine "hatchling==1.18.0", ] From 319d9f9efba39a9aa880ea259a95c8c6fcc261ce Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jul 2023 08:03:22 +0200 Subject: [PATCH 41/45] Use build instead of hatch --- .github/workflows/publish.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 108c895..98c52fc 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,8 +21,8 @@ jobs: - name: Build packages run: | - pip install -U pip hatch - hatch build + pip install -U pip build + python -m build sdist wheel - name: Upload to PyPI uses: pypa/gh-action-pypi-publish@release/v1.8 @@ -31,7 +31,6 @@ jobs: uses: openzim/docker-publish-action@v10 with: image-name: openzim/kolibri - on-master: dev tag-pattern: /^v([0-9.]+)$/ latest-on-tag: true restrict-to: openzim/kolibri From 2673c688f8b653117030b3e1eef2998ade5473ef Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jul 2023 08:07:37 +0200 Subject: [PATCH 42/45] Small fixes / change revert following review --- hatch_build.py | 5 +++-- pyproject.toml | 2 ++ src/kolibri2zim/scraper.py | 44 ++++++++++++++++++-------------------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/hatch_build.py b/hatch_build.py index 314e24a..86dfa52 100644 --- a/hatch_build.py +++ b/hatch_build.py @@ -29,7 +29,7 @@ def initialize(self, version, build_data): return Path(self.root).joinpath("src/kolibri2zim/templates/assets") subprocess.run( - str(Path(self.root).joinpath("get_js_deps.sh")), # : S603 + str(Path(self.root).joinpath("get_js_deps.sh")), check=True, ) return 
super().initialize(version, build_data) @@ -38,7 +38,8 @@ def deps_already_installed(self) -> bool: for dep in JS_DEPS: if ( not Path(self.root) - .joinpath(f"src/kolibri2zim/templates/assets/{dep}") + .joinpath("src/kolibri2zim/templates/assets") + .joinpath(dep) .exists() ): return False diff --git a/pyproject.toml b/pyproject.toml index 860390e..96879ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,6 +162,8 @@ select = [ ignore = [ # Allow non-abstract empty methods in abstract base classes "B027", + # Allow use of datetime with tz and date.today + "DTZ005", "DTZ011", # Remove flake8-errmsg since we consider they bloat the code and provide limited value "EM", # Allow boolean positional values in function calls, like `dict.get(... True)` diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py index c72d7ea..11a28db 100644 --- a/src/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -32,7 +32,7 @@ from zimscraperlib.zim.creator import Creator from zimscraperlib.zim.items import StaticItem -from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, get_logger +from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, Global, get_logger from kolibri2zim.database import KolibriDB from kolibri2zim.debug import ( ON_DISK_THRESHOLD, @@ -108,11 +108,12 @@ def go(option): # zim params self.fname = go("fname") - tags = go("tags") - if tags is None: - self.tags = [] - else: - self.tags = [t.strip() for t in tags.split(",")] + self.tags = ( + [] + if go("tags") is None + else [t.strip() for t in go("tags").split(",")] # pyright: ignore + ) + self.title = go("title") self.description = go("description") self.long_description = go("long_description") @@ -126,17 +127,14 @@ def go(option): self.css = go("css") # directory setup - self.output_dir = Path(str(go("output_dir"))).expanduser().resolve() - tmp_dir = go("tmp_dir") - if tmp_dir: - Path(tmp_dir).mkdir(parents=True, exist_ok=True) - self.build_dir = 
Path(tempfile.mkdtemp(dir=tmp_dir)) + self.output_dir = Path(go("output_dir") or "/output").expanduser().resolve() + if go("tmp_dir"): + Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True) # pyright: ignore + self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir"))) # performances options - nb_threads_str = go("threads") - self.nb_threads = int(nb_threads_str) if nb_threads_str else None - nb_processes_str = go("processes") - self.nb_processes = int(nb_processes_str) if nb_processes_str else None + self.nb_threads = int(go("threads") or 1) + self.nb_processes = int(go("processes") or Global.nb_available_cpus) self.s3_url_with_credentials = go("s3_url_with_credentials") self.s3_storage = None self.dedup_html_files = go("dedup_html_files") @@ -146,9 +144,10 @@ def go(option): self.keep_build_dir = go("keep_build_dir") self.debug = go("debug") self.only_topics = go("only_topics") - node_ids = go("node_ids") self.node_ids = ( - None if node_ids is None else [t.strip() for t in node_ids.split(",")] + None + if go("node_ids") is None + else [t.strip() for t in go("node_ids").split(",")] # pyright: ignore ) # jinja2 environment setup @@ -823,7 +822,7 @@ def run(self): LongDescription=self.long_description, Creator=self.author, Publisher=self.publisher, - Date=datetime.datetime.now(datetime.UTC), + Date=datetime.date.today(), Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(), ) self.creator.start() @@ -871,9 +870,8 @@ def run(self): f"FAILURE not_done={len(result.not_done)} done={len(result.done)}" ) for future in result.done: - future_exception = future.exception() - if future_exception: - raise future_exception + if future.exception(): + raise future.exception() # pyright:ignore except KeyboardInterrupt: self.creator.can_finish = False logger.error("KeyboardInterrupt, exiting.") @@ -929,12 +927,12 @@ def sanitize_inputs(self): channel_meta = self.db.get_channel_metadata(self.channel_id) # input & metadata sanitation - period = 
datetime.datetime.now(datetime.UTC).strftime("%Y-%m") + period = datetime.datetime.now().strftime("%Y-%m") if self.fname: # make sure we were given a filename and not a path fname_path = Path(str(self.fname).format(period=period)) if Path(fname_path.name) != fname_path: - raise ValueError(f"filename is not a filename: {fname_path}") + raise ValueError(f"filename is not a filename: {self.fname}") self.clean_fname = str(fname_path) else: self.clean_fname = f"{self.name}_{period}.zim" From dfaf970ad1f2911c162280103685e6b39975855e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jul 2023 08:08:28 +0200 Subject: [PATCH 43/45] Simplify code + add support for setting only the long description --- src/kolibri2zim/scraper.py | 47 ++++++++++++++--------------------- tests/test_sanitize_inputs.py | 12 ++++----- 2 files changed, 24 insertions(+), 35 deletions(-) diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py index 11a28db..412bcf4 100644 --- a/src/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -941,36 +941,25 @@ def sanitize_inputs(self): self.title = channel_meta["name"] self.title = self.title.strip() + if self.description and len(self.description) > MAX_DESC_LENGTH: + raise ValueError( + f"Description too long ({len(self.description)}>{MAX_DESC_LENGTH})" + ) + if self.long_description and len(self.long_description) > MAX_LONG_DESC_LENGTH: + raise ValueError( + f"LongDescription too long ({len(self.long_description)}" + f">{MAX_LONG_DESC_LENGTH})" + ) + + kolibri_desc = channel_meta["description"].strip() + if not self.long_description and len(kolibri_desc) > MAX_DESC_LENGTH: + self.long_description = kolibri_desc[0:MAX_LONG_DESC_LENGTH] + if len(kolibri_desc) > MAX_LONG_DESC_LENGTH: + self.long_description = self.long_description[:-1] + "…" if not self.description: - # User did not provided a description, we will infer it from channel - # metadata, limited to maximum length - if self.long_description: - raise ValueError( -
"long_description cannot be set if description is not set" - ) - self.description = channel_meta["description"].strip() - if len(self.description) > MAX_DESC_LENGTH: - self.long_description = self.description - self.description = f"{self.description[0:MAX_DESC_LENGTH-1]}…" - if len(self.long_description) > MAX_LONG_DESC_LENGTH: - self.long_description = ( - f"{self.long_description[0:MAX_LONG_DESC_LENGTH-1]}…" - ) - else: - self.description = self.description.strip() - if len(self.description) > MAX_DESC_LENGTH: - raise ValueError( - f"description is too long ({len(self.description)}" - f">{MAX_DESC_LENGTH})" - ) - if ( - self.long_description - and len(self.long_description) > MAX_LONG_DESC_LENGTH - ): - raise ValueError( - f"long_description is too long ({len(self.long_description)}" - f">{MAX_LONG_DESC_LENGTH})" - ) + self.description = kolibri_desc[0:MAX_DESC_LENGTH] + if len(kolibri_desc) > MAX_DESC_LENGTH: + self.description = self.description[:-1] + "…" if not self.author: self.author = channel_meta["author"] or "Kolibri" diff --git a/tests/test_sanitize_inputs.py b/tests/test_sanitize_inputs.py index 1504c12..0569c09 100644 --- a/tests/test_sanitize_inputs.py +++ b/tests/test_sanitize_inputs.py @@ -92,7 +92,7 @@ def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]): LONG_TEXT[0:MAX_DESC_LEN], None, ), - # CLI description set and is too long, channel description doe not matter + # CLI description set and is too long, channel description does not matter (LONG_TEXT[0 : MAX_DESC_LEN + 1], None, TEXT_NOT_USED, True, None, None), # CLI description not set and channel description is short enough (None, None, LONG_TEXT[0:MAX_DESC_LEN], False, LONG_TEXT[0:MAX_DESC_LEN], None), @@ -145,14 +145,14 @@ def test_sanitize_defaults_ok(scraper_generator: Callable[..., Kolibri2Zim]): None, ), # CLI description not set, CLI long descripion set and is short, - # channel description does not matter + # channel description set to something different than 
long desc ( None, LONG_TEXT[0:MAX_LONG_DESC_LEN], - TEXT_NOT_USED, - True, - None, - None, + LONG_TEXT[10:MAX_LONG_DESC_LEN], + False, + LONG_TEXT[10 : MAX_DESC_LEN + 9] + "…", + LONG_TEXT[0:MAX_LONG_DESC_LEN], ), ], ) From b005ed25ca3e44474d6ce49c79d4d35989daea47 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jul 2023 10:50:50 +0200 Subject: [PATCH 44/45] Fix CHANGELOG to add latest changes --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index adbabde..a5753f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Add `--long-description` CLI parameter to set ZIM long description +- Add `--node-ids` CLI parameter to process only few channel nodes (_useful for debugging mostly_) + ### Fixed +- Fixed issue with ZIM description too long when sourced from channel metadata +- Fixed issue with ZIM icon sizes / formats - Fix issue with ePub rendering which was outside the iframe - Description is now limited to expected lenght and long description is set - Icons and illustrations are squared as expected @@ -17,11 +23,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Migrate to our new Python standard (hatch, ruff, pyright, ...) 
- Using zimscraperlib 3.1.1 -- Updated image to `python:3.11-bullseye` +- Updated image to `python:3.11-bookworm` - Retry video reencoding up to three times - Move inline javascript to dedicated files - Move huge inline CSS to dedicated file -- Add `--node-ids` CLI parameter to process only few nodes (useful for debugging) ## [1.0.1] - 2023-02-22 @@ -34,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [1.0.0] - 2021-11-11 +### Added - initial version - supports topic/document/audio/video/html5/exercise content types - uses libzim7 From 9e4a610e2bdab2ff15d222f8c4b4b802fb38ef12 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jul 2023 13:15:59 +0200 Subject: [PATCH 45/45] Use today() as it is already used somewhere else and sufficient --- pyproject.toml | 4 ++-- src/kolibri2zim/scraper.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 96879ec..659a2d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,8 +162,8 @@ select = [ ignore = [ # Allow non-abstract empty methods in abstract base classes "B027", - # Allow use of datetime with tz and date.today - "DTZ005", "DTZ011", + # Allow use of date.today + "DTZ011", # Remove flake8-errmsg since we consider they bloat the code and provide limited value "EM", # Allow boolean positional values in function calls, like `dict.get(... True)` diff --git a/src/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py index 412bcf4..e812305 100644 --- a/src/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -927,7 +927,7 @@ def sanitize_inputs(self): channel_meta = self.db.get_channel_metadata(self.channel_id) # input & metadata sanitation - period = datetime.datetime.now().strftime("%Y-%m") + period = datetime.date.today().strftime("%Y-%m") if self.fname: # make sure we were given a filename and not a path fname_path = Path(str(self.fname).format(period=period))