diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml new file mode 100644 index 0000000..53ace14 --- /dev/null +++ b/.github/workflows/Tests.yml @@ -0,0 +1,38 @@ +name: Tests + +on: + pull_request: + push: + branches: + - main + +jobs: + run-tests: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[test,scripts] + + - name: Run the tests + run: inv coverage --args "-vvv" + + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Ensure we can build targets + run: | + pip install build + python3 -m build diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 2ca5fb4..0000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Docker - -on: - push: - branches: - - main - tags: - - v* - -jobs: - build-and-push: - name: Deploy Docker Image - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3.4.0 - - name: Build and push - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/kolibri - on-master: dev - tag-pattern: /^v([0-9.]+)$/ - latest-on-tag: true - restrict-to: openzim/kolibri - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..98c52fc --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,42 @@ +name: Build and upload to PyPI + +on: + release: + types: [published] + +jobs: + publish: + runs-on: ubuntu-22.04 + permissions: + id-token: write # mandatory for PyPI trusted publishing + + steps: + - uses: 
actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Build packages + run: | + pip install -U pip build + python -m build --sdist --wheel + + - name: Upload to PyPI + uses: pypa/gh-action-pypi-publish@release/v1.8 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/kolibri + tag-pattern: /^v([0-9.]+)$/ + latest-on-tag: true + restrict-to: openzim/kolibri + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml new file mode 100644 index 0000000..be2b9de --- /dev/null +++ b/.github/workflows/pull.yml @@ -0,0 +1,10 @@ +name: Pull Request + +on: + pull_request: + +jobs: + qa: + uses: ./.github/workflows/qa.yml + # run qa job if the pull request originates from a fork (otherwise the qa is already triggered by the push to a branch) + if: github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml new file mode 100644 index 0000000..d84f392 --- /dev/null +++ b/.github/workflows/push.yml @@ -0,0 +1,8 @@ +name: Push + +on: + push: + +jobs: + qa: + uses: ./.github/workflows/qa.yml diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml new file mode 100644 index 0000000..54c93eb --- /dev/null +++ b/.github/workflows/qa.yml @@ -0,0 +1,31 @@ +name: QA + +on: + workflow_call: + +jobs: + check-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[lint,check,scripts,test] + + - name: Check black 
formatting + run: inv lint-black + + - name: Check ruff + run: inv lint-ruff + + - name: Check pyright + run: inv check-pyright diff --git a/.gitignore b/.gitignore index fb4df5f..db971bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,178 @@ +# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,database,visualstudiocode,intellij +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,database,visualstudiocode,intellij +### Database ### +*.accdb +*.db +*.dbf +*.mdb +*.pdb +*.sqlite3 +*.db-shm +*.db-wal + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file 
is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -21,7 +195,6 @@ parts/ sdist/ var/ wheels/ -pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -51,6 +224,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +cover/ # Translations *.mo @@ -73,6 +247,7 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook @@ -83,7 +258,9 @@ profile_default/ ipython_config.py # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -92,7 +269,22 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# PEP 582; used by e.g. github.com/David-OConnor/pyflow +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff @@ -129,17 +321,66 @@ dmypy.json # Pyre type checker .pyre/ -.DS_Store +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,database,visualstudiocode,intellij + # assets that we download -kolibri2zim/templates/assets/bootstrap/ -kolibri2zim/templates/assets/pdfjs/ -kolibri2zim/templates/assets/videojs/ -kolibri2zim/templates/assets/jquery.min.js -kolibri2zim/templates/assets/ogvjs/ -kolibri2zim/templates/assets/videojs-ogvjs.js .dockerignore -kolibri2zim/templates/assets/epub.min.js -kolibri2zim/templates/assets/bootstrap-icons/ -kolibri2zim/templates/assets/jszip.min.js -kolibri2zim/templates/assets/perseus/ 
+src/kolibri2zim/templates/assets/bootstrap/ +src/kolibri2zim/templates/assets/pdfjs/ +src/kolibri2zim/templates/assets/videojs/ +src/kolibri2zim/templates/assets/jquery.min.js +src/kolibri2zim/templates/assets/ogvjs/ +src/kolibri2zim/templates/assets/videojs-ogvjs.js +src/kolibri2zim/templates/assets/epub.min.js +src/kolibri2zim/templates/assets/bootstrap-icons/ +src/kolibri2zim/templates/assets/jszip.min.js +src/kolibri2zim/templates/assets/perseus/ + +# output dir +output + +# ignore all vscode, this is not standard configuration in this place +.vscode diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..577ac69 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/psf/black + rev: "23.3.0" + hooks: + - id: black +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.272 + hooks: + - id: ruff +- repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.315 + hooks: + - id: pyright + name: pyright (system) + description: 'pyright static type checker' + entry: pyright + language: system + 'types_or': [python, pyi] + require_serial: true + minimum_pre_commit_version: '2.9.2' diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eb19cd..a5753f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,17 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## Fixed +### Added +- Add `--long-description` CLI parameter to set ZIM long description +- Add `--node-ids` CLI parameter to process only few channel nodes (_useful for debugging mostly_) + +### Fixed +- Fixed issue with ZIM description too long when sourced from channel metadata +- Fixed issue with ZIM icon sizes / formats - Fix 
issue with ePub rendering which was outside the iframe +- Description is now limited to expected length and long description is set +- Icons and illustrations are squared as expected +- Many small fixes (including some bugs) detected by ruff / pyright ### Changed -- Using zimscraperlib 3.1.0 -- Updated image to `python:3.11-bullseye` +- Migrate to our new Python standard (hatch, ruff, pyright, ...) +- Using zimscraperlib 3.1.1 +- Updated image to `python:3.11-bookworm` - Retry video reencoding up to three times - Move inline javascript to dedicated files - Move huge inline CSS to dedicated file -- Add `--node-ids` CLI parameter to process only few nodes (useful for debugging) ## [1.0.1] - 2023-02-22 @@ -30,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [1.0.0] - 2021-11-11 +### Added - initial version - supports topic/document/audio/video/html5/exercise content types - uses libzim7 diff --git a/Dockerfile b/Dockerfile index 274c715..743bd2b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,23 @@ -FROM python:3.11-bullseye -LABEL org.opencontainers.image.source https://github.com/openzim/kolibri2zim +FROM python:3.11-bookworm +LABEL org.opencontainers.image.source https://github.com/openzim/kolibri # Install necessary packages -RUN apt-get update -y \ - && apt-get install -y --no-install-recommends locales-all unzip ffmpeg \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + locales-all \ + unzip \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* \ + && python -m pip install --no-cache-dir -U \ + pip -COPY requirements.txt /src/ -RUN pip3 install --no-cache-dir -r /src/requirements.txt -COPY kolibri2zim /src/kolibri2zim -COPY setup.py *.md get_js_deps.sh MANIFEST.in /src/ -RUN cd /src/ && ./get_js_deps.sh && python3 ./setup.py install +# Copy code + associated artifacts +COPY src /src/src +COPY pyproject.toml *.md get_js_deps.sh MANIFEST.in 
LICENSE *.py /src/ + +# Install + cleanup +RUN pip install --no-cache-dir /src \ + && rm -rf /src # default output directory RUN mkdir -p /output diff --git a/dump_channel_to_fs.py b/dump_channel_to_fs.py index ad066f7..4c67395 100755 --- a/dump_channel_to_fs.py +++ b/dump_channel_to_fs.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu @@ -18,14 +17,14 @@ Uses wget for downloads """ +import contextlib +import logging +import multiprocessing as mp import os -import sys import pathlib -import logging import sqlite3 -import contextlib import subprocess -import multiprocessing as mp +import sys STUDIO_DEFAULT_BASE_URL = "https://studio.learningequality.org" STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL) @@ -36,7 +35,7 @@ logger = logging.getLogger("dump-remote") -def download_if_missing(url, fpath, fsize=None, force=False): +def download_if_missing(url, fpath, fsize=None, *, force=False): skipped = ( fpath.exists() and (fsize is not None and os.path.getsize(fpath) == fsize) @@ -90,12 +89,11 @@ def get_rows(db_path, query): cursor = conn.execute(query) rows = cursor.fetchmany() while rows: - for row in rows: - yield row + yield from rows rows = cursor.fetchmany() -def dump(channel_id, build_dir=None, force=False): +def dump(channel_id: str, build_dir: str | None, *, force: bool): build_path = pathlib.Path(build_dir or "build") logger.info(f"dumping {channel_id} into {build_path}") build_path.mkdir(exist_ok=True, parents=True) @@ -113,7 +111,7 @@ def dump(channel_id, build_dir=None, force=False): nb_files = get_single_value(db_path, "SELECT COUNT(*) FROM content_file") logger.info(f"Looping over all {nb_files} files") - def on_error(*args, **kwargs): + def on_error(*args, **kwargs): # noqa: ARG001 logger.error("Failed to download something") def on_success(result): @@ -147,7 +145,12 @@ def on_success(result): if __name__ == "__main__": - if len(sys.argv) < 2: - print("Missing channel ID") + args = 
[sys.argv[idx] if len(sys.argv) >= idx + 1 else None for idx in range(4)] + _, channel_id, build_dir, force = args + + if not channel_id: + logger.error("Missing channel ID") sys.exit(1) - dump(*sys.argv[1:]) + force = bool(str(force).lower() in ("true", "force", "yes")) + + dump(channel_id=channel_id, build_dir=build_dir, force=force) diff --git a/get_js_deps.sh b/get_js_deps.sh index ff10f77..8954e5b 100755 --- a/get_js_deps.sh +++ b/get_js_deps.sh @@ -1,5 +1,7 @@ #!/bin/sh +set -e + ### # download JS dependencies and place them in our templates/assets folder # then launch our ogv.js script to fix dynamic loading links @@ -21,7 +23,7 @@ fi # Absolute path this script is in. SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )" -ASSETS_PATH="${SCRIPT_PATH}/kolibri2zim/templates/assets" +ASSETS_PATH="${SCRIPT_PATH}/src/kolibri2zim/templates/assets" echo "About to download JS assets to ${ASSETS_PATH}" diff --git a/hatch_build.py b/hatch_build.py new file mode 100644 index 0000000..86dfa52 --- /dev/null +++ b/hatch_build.py @@ -0,0 +1,46 @@ +import logging +import subprocess +from pathlib import Path + +from hatchling.builders.hooks.plugin.interface import BuildHookInterface + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +# update list in constants.py as well +JS_DEPS = [ + "pdfjs", + "videojs", + "ogvjs", + "bootstrap", + "bootstrap-icons", + "perseus", + "epub.min.js", + "jszip.min.js", + "jquery.min.js", + "videojs-ogvjs.js", +] + + +class GetJsDepsHook(BuildHookInterface): + def initialize(self, version, build_data): + if self.deps_already_installed(): + logger.info("JS dependencies are already installed, skipping it") + return + Path(self.root).joinpath("src/kolibri2zim/templates/assets") + subprocess.run( + str(Path(self.root).joinpath("get_js_deps.sh")), + check=True, + ) + return super().initialize(version, build_data) + + def deps_already_installed(self) -> bool: + for dep in JS_DEPS: + if ( + not Path(self.root) + 
.joinpath("src/kolibri2zim/templates/assets") + .joinpath(dep) + .exists() + ): + return False + return True diff --git a/kolibri2zim/VERSION b/kolibri2zim/VERSION deleted file mode 100644 index 7dea76e..0000000 --- a/kolibri2zim/VERSION +++ /dev/null @@ -1 +0,0 @@ -1.0.1 diff --git a/kolibri2zim/templates/assets/perseus_exercise.js b/kolibri2zim/templates/assets/perseus_exercise.js deleted file mode 100644 index dfbc7a7..0000000 --- a/kolibri2zim/templates/assets/perseus_exercise.js +++ /dev/null @@ -1 +0,0 @@ -less = { env: 'development', logLevel: 1 }; \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..659a2d7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,226 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "kolibri2zim" +authors = [{ name = "Kiwix", email = "dev@kiwix.org" }] +keywords = ["kiwix", "zim", "offline", "kolibri"] +requires-python = ">=3.11" +description = "Make ZIM file from Kolibri Channels" +readme = "README.md" +license = { text = "GPL-3.0-or-later" } +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", +] +dependencies = [ + "zimscraperlib==3.1.1", + "kiwixstorage==0.8.3", + "Jinja2==3.1.2", + "pif==0.8.2", + "beautifulsoup4==4.9.3", + "retrying==1.3.4", +] +dynamic = ["version"] + +[project.optional-dependencies] +scripts = ["invoke==2.1.3"] +lint = ["black==23.3.0", "ruff==0.0.272"] +check = ["pyright==1.1.318"] +test = [ + "pytest==7.4.0", + "coverage==7.2.7", +] +dev = [ + "pre-commit==3.3.3", + "debugpy==1.6.7", + "kolibri2zim[scripts]", + "kolibri2zim[lint]", + "kolibri2zim[test]", + "kolibri2zim[check]", + # hatchling is a dev dependency only needed for hook development on developer machine + "hatchling==1.18.0", +] + +[project.urls] +Homepage = "https://github.com/openzim/kolibri" +Donate = 
"https://www.kiwix.org/en/support-us/" + +[project.scripts] +kolibri2zim = "kolibri2zim:entrypoint.main" + +[tool.hatch.version] +path = "src/kolibri2zim/__about__.py" + +[tool.hatch.build] +exclude = ["/.github"] + +[tool.hatch.build.hooks.custom] +path = "hatch_build.py" +dependencies = ["zimscraperlib==3.1.1"] + +[tool.hatch.envs.default] +features = ["dev"] + +[tool.hatch.envs.test] +features = ["scripts", "test"] + +[tool.hatch.envs.test.scripts] +run = "inv test --args '{args}'" +run-cov = "inv test-cov --args '{args}'" +report-cov = "inv report-cov" +coverage = "inv coverage --args '{args}'" + +[tool.hatch.envs.lint] +template = "lint" +python = "py311" +skip-install = false +features = ["scripts", "lint"] + +[tool.hatch.envs.lint.scripts] +black = "inv lint-black --args '{args}'" +ruff = "inv lint-ruff --args '{args}'" +all = "inv lintall --args '{args}'" +fix-black = "inv fix-black --args '{args}'" +fix-ruff = "inv fix-ruff --args '{args}'" +fixall = "inv fixall --args '{args}'" + +[tool.hatch.envs.check] +features = ["scripts", "check"] + +[tool.hatch.envs.check.scripts] +pyright = "inv check-pyright --args '{args}'" +all = "inv checkall --args '{args}'" + +[tool.black] +line-length = 88 +target-version = ['py311'] + +[tool.ruff] +target-version = "py311" +line-length = 88 +src = ["src"] +select = [ + "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # 
flake8-no-pep420 + # "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow use of date.today + "DTZ011", + # Remove flake8-errmsg since we consider they bloat the code and provide limited value + "EM", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore warnings on subprocess.run / popen + "S603", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["kolibri2zim"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.pytest.ini_options] +minversion = "7.3" +testpaths = ["tests"] +pythonpath = [".", "src"] + +[tool.coverage.paths] +great_project = ["src/kolibri2zim"] +tests = ["tests"] + +[tool.coverage.run] +source_pkgs = ["kolibri2zim"] +branch = true +parallel = true +omit = [ + "src/kolibri2zim/__about__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pyright] +include = ["src", "tests", "tasks.py"] +exclude = ["**/node_modules", + "**/__pycache__", + "src/kolibri2zim/templates", +] +extraPaths = ["src"] +pythonVersion = "3.11" +pythonPlatform = "All" +typeCheckingMode="basic" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index fe0d483..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -## Direct dependencies -zimscraperlib==3.1.0 -kiwixstorage==0.8.3 -Jinja2==3.1.2 -pif==0.8.2 -beautifulsoup4==4.9.3 -retrying==1.3.4 diff --git a/setup.py b/setup.py deleted file mode 100644 index c6b1eab..0000000 --- a/setup.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - -import pathlib -import subprocess -from setuptools import setup - -root_dir = pathlib.Path(__file__).parent - - -def read(*names, **kwargs): - with open(root_dir.joinpath(*names), "r") as fh: - return fh.read() - - -print("Downloading and fixing JS dependencies...") 
-subprocess.run([str(root_dir.joinpath("get_js_deps.sh").resolve())], check=True) - - -setup( - name="kolibri2zim", - version=read("kolibri2zim", "VERSION").strip(), - description="Make ZIM file from Kolibri Channels", - long_description=read("README.md"), - long_description_content_type="text/markdown", - author="satyamtg", - author_email="io.satyamtg@gmail.com", - url="https://github.com/openzim/kolibri2zim", - keywords="kiwix zim offline kolibri", - license="GPLv3+", - packages=["kolibri2zim"], - install_requires=[ - line.strip() - for line in read("requirements.txt").splitlines() - if not line.strip().startswith("#") - ], - zip_safe=False, - include_package_data=True, - entry_points={ - "console_scripts": [ - "kolibri2zim=kolibri2zim.__main__:main", - ] - }, - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - ], - python_requires=">=3.6", -) diff --git a/src/kolibri2zim/__about__.py b/src/kolibri2zim/__about__.py new file mode 100644 index 0000000..f4c3a84 --- /dev/null +++ b/src/kolibri2zim/__about__.py @@ -0,0 +1 @@ +__version__ = "1.1.0-dev0" diff --git a/kolibri2zim/__init__.py b/src/kolibri2zim/__init__.py similarity index 100% rename from kolibri2zim/__init__.py rename to src/kolibri2zim/__init__.py diff --git a/kolibri2zim/__main__.py b/src/kolibri2zim/__main__.py similarity index 89% rename from kolibri2zim/__main__.py rename to src/kolibri2zim/__main__.py index 03b42a7..c595cdc 100644 --- a/kolibri2zim/__main__.py +++ b/src/kolibri2zim/__main__.py @@ -1,14 +1,13 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import sys import pathlib +import sys def main(): # allows running it from source using python kolibri2zim - sys.path = [str(pathlib.Path(__file__).parent.parent.resolve())] + sys.path + sys.path = 
[str(pathlib.Path(__file__).parent.parent.resolve()), *sys.path] from kolibri2zim.entrypoint import main as entry diff --git a/kolibri2zim/constants.py b/src/kolibri2zim/constants.py similarity index 72% rename from kolibri2zim/constants.py rename to src/kolibri2zim/constants.py index 578c8fc..53e0aa5 100644 --- a/kolibri2zim/constants.py +++ b/src/kolibri2zim/constants.py @@ -1,32 +1,46 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import os -import pathlib import logging import multiprocessing +import os +import pathlib from zimscraperlib.logging import getLogger as lib_getLogger +from kolibri2zim.__about__ import __version__ + ROOT_DIR = pathlib.Path(__file__).parent NAME = ROOT_DIR.name -with open(ROOT_DIR.joinpath("VERSION"), "r") as fh: - VERSION = fh.read().strip() +VERSION = __version__ SCRAPER = f"{NAME} {VERSION}" STUDIO_DEFAULT_BASE_URL = "https://studio.learningequality.org" STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL) +# when modifying this list, update list in hatch_build.py as well +JS_DEPS: list[str] = [ + "pdfjs", + "videojs", + "ogvjs", + "bootstrap", + "bootstrap-icons", + "perseus", + "epub.min.js", + "jszip.min.js", + "jquery.min.js", + "videojs-ogvjs.js", +] + def is_running_inside_container(): fpath = pathlib.Path("/proc/self/cgroup") if not fpath.exists(): return False try: - with open(fpath, "r") as fh: + with open(fpath) as fh: for line in fh.readlines(): if line.strip().rsplit(":", 1)[-1] != "/": return True @@ -38,6 +52,7 @@ def is_running_inside_container(): class Global: debug = False inside_container = is_running_inside_container() + nb_available_cpus: int Global.nb_available_cpus = ( @@ -45,11 +60,11 @@ class Global: ) -def setDebug(debug): +def set_debug(debug): """toggle constants global DEBUG flag (used by getLogger)""" Global.debug = bool(debug) -def getLogger(): +def get_logger(): """configured logger respecting DEBUG flag""" return lib_getLogger(NAME, level=logging.DEBUG 
if Global.debug else logging.INFO) diff --git a/kolibri2zim/database.py b/src/kolibri2zim/database.py similarity index 88% rename from kolibri2zim/database.py rename to src/kolibri2zim/database.py index 3ce6b3c..7ecfa6e 100644 --- a/kolibri2zim/database.py +++ b/src/kolibri2zim/database.py @@ -1,9 +1,8 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import pathlib import logging +import pathlib import sqlite3 logger = logging.getLogger(__name__) @@ -23,7 +22,7 @@ class KolibriDB: Kolibri uses the Modified Preorder Tree Traversal model, from django-mptt https://gist.github.com/tmilos/f2f999b5839e2d42d751""" - def __init__(self, fpath: pathlib.Path, root_id: str = None): + def __init__(self, fpath: pathlib.Path, root_id: str | None = None): self.conn = sqlite3.connect( f"file:{fpath.expanduser().resolve()}?mode=ro", uri=True, @@ -74,8 +73,7 @@ def get_rows(self, query, *args, **kwargs): cursor = conn.execute(query, *args, **kwargs) rows = cursor.fetchmany() while rows: - for row in rows: - yield row + yield from rows rows = cursor.fetchmany() def get_channel_metadata(self, channel_id): @@ -95,8 +93,7 @@ def get_node_descendants(self, node_id, left=None, right=None): "ORDER BY level ASC", (left, right), ): - row = dict(row) - yield row + yield dict(row) def get_node_children(self, node_id, left=None, right=None): if left is None or right is None: @@ -111,17 +108,17 @@ def get_node_children(self, node_id, left=None, right=None): "ORDER BY level ASC", (left, right, node_id), ): - row = dict(row) - row.update( + rowdict = dict(row) + rowdict.update( { - "thumbnail": self.get_thumbnail_name(row["id"]), + "thumbnail": self.get_thumbnail_name(rowdict["id"]), } ) - yield row + yield rowdict def get_node_children_count(self, node_id, left=None, right=None): if left is None or right is None: - node = self.get_node(with_parents=False, with_children=False) + node = self.get_node(node_id, with_parents=False, with_children=False) left = 
node["left"] right = node["right"] @@ -133,7 +130,7 @@ def get_node_children_count(self, node_id, left=None, right=None): def get_node_parents(self, node_id, left=None, right=None): if left is None or right is None: - node = self.get_node(with_parents=False, with_children=False) + node = self.get_node(node_id, with_parents=False, with_children=False) left = node["left"] right = node["right"] @@ -148,7 +145,7 @@ def get_node_parents(self, node_id, left=None, right=None): def get_node_parents_count(self, node_id, left=None, right=None): if left is None or right is None: - node = self.get_node(with_parents=False, with_children=False) + node = self.get_node(node_id, with_parents=False, with_children=False) left = node["left"] right = node["right"] @@ -160,7 +157,7 @@ def get_node_parents_count(self, node_id, left=None, right=None): (left, right, self.root_left, self.root_right), ) - def get_node(self, node_id, with_parents=False, with_children=False): + def get_node(self, node_id, *, with_parents=False, with_children=False): node = self.get_row( "SELECT id, title, description, author, level, kind, " "license_name as license, license_owner, " @@ -196,13 +193,13 @@ def get_node(self, node_id, with_parents=False, with_children=False): ) return node - def get_node_file(self, node_id, thumbnail=False): + def get_node_file(self, node_id, *, thumbnail=False): try: - return next(self.get_node_files(node_id, thumbnail)) + return next(self.get_node_files(node_id, thumbnail=thumbnail)) except StopIteration: return None - def get_node_files(self, node_id, thumbnail=False): + def get_node_files(self, node_id, *, thumbnail=False): for row in self.get_rows( "SELECT id as fid, local_file_id as id, " "extension as ext, priority as prio, " diff --git a/kolibri2zim/debug.py b/src/kolibri2zim/debug.py similarity index 79% rename from kolibri2zim/debug.py rename to src/kolibri2zim/debug.py index b6fe2cd..0077113 100644 --- a/kolibri2zim/debug.py +++ b/src/kolibri2zim/debug.py @@ -1,11 
+1,10 @@ import io import logging import pathlib -from typing import Optional, Tuple import requests from retrying import retry -from zimscraperlib.download import stream_file, _get_retry_adapter +from zimscraperlib.download import _get_retry_adapter, stream_file from zimscraperlib.video.encoding import reencode logging.basicConfig(level=logging.DEBUG) @@ -19,9 +18,11 @@ # retry up to 3 times, with delay from 40s @retry(stop_max_attempt_number=3, wait_exponential_multiplier=20000) -def get_size_and_mime(url: str) -> Tuple[int, str]: +def get_size_and_mime(url: str) -> tuple[int | None, str]: logger.debug(f"get_size_and_mime({url=})") - _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True) + _, headers = stream_file( + url, byte_stream=io.BytesIO(), only_first_block=True + ) # type: ignore # see https://github.com/openzim/python-scraperlib/issues/104 mimetype = headers.get("Content-Type", "application/octet-stream") # Encoded data (compressed) prevents us from using Content-Length header # as source for the content (it represents length of compressed data) @@ -41,8 +42,8 @@ def get_size_and_mime(url: str) -> Tuple[int, str]: @retry(stop_max_attempt_number=5, wait_exponential_multiplier=20000) def download_to( url: str, - fpath: Optional[pathlib.Path] = None, - byte_stream: Optional[io.IOBase] = None, + fpath: pathlib.Path | None = None, + byte_stream: io.BytesIO | None = None, ): logger.debug(f"download_to({url=}) {'to-file' if fpath else 'to-mem'}") stream_file(url, fpath=fpath, byte_stream=byte_stream) diff --git a/kolibri2zim/entrypoint.py b/src/kolibri2zim/entrypoint.py similarity index 94% rename from kolibri2zim/entrypoint.py rename to src/kolibri2zim/entrypoint.py index 93f9f28..34bc34b 100755 --- a/kolibri2zim/entrypoint.py +++ b/src/kolibri2zim/entrypoint.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import sys import argparse +import sys -from .constants import NAME, SCRAPER, 
Global, getLogger, setDebug +from kolibri2zim.constants import NAME, SCRAPER, Global, get_logger, set_debug def main(): @@ -48,6 +47,11 @@ def main(): help="Custom description for your ZIM. Kolibri channel description otherwise", ) + parser.add_argument( + "--long-description", + help="Custom long description for your ZIM, optional", + ) + parser.add_argument( "--favicon", help="URL/path for Favicon. Kolibri channel thumbnail otherwise " @@ -190,10 +194,10 @@ def main(): ) args = parser.parse_args() - setDebug(args.debug) - logger = getLogger() + set_debug(args.debug) + logger = get_logger() - from .scraper import Kolibri2Zim + from kolibri2zim.scraper import Kolibri2Zim try: scraper = Kolibri2Zim(**dict(args._get_kwargs())) @@ -202,7 +206,7 @@ def main(): logger.error(f"FAILED. An error occurred: {exc}") if args.debug: logger.exception(exc) - raise SystemExit(1) + raise SystemExit(1) from exc if __name__ == "__main__": diff --git a/kolibri2zim/nodes.py b/src/kolibri2zim/nodes.py similarity index 100% rename from kolibri2zim/nodes.py rename to src/kolibri2zim/nodes.py diff --git a/kolibri2zim/processing.py b/src/kolibri2zim/processing.py similarity index 90% rename from kolibri2zim/processing.py rename to src/kolibri2zim/processing.py index 6de0f4f..6132850 100644 --- a/kolibri2zim/processing.py +++ b/src/kolibri2zim/processing.py @@ -1,13 +1,11 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu from zimscraperlib.video.encoding import reencode -from .constants import getLogger +from kolibri2zim.constants import get_logger - -logger = getLogger() +logger = get_logger() def post_process_video(video_dir, video_id, preset, video_format, low_quality): @@ -27,7 +25,8 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality): raise FileNotFoundError(f"Missing video file in {video_dir}") if len(files) > 1: logger.warning( - f"Multiple video file candidates for {video_id} in {video_dir}. 
Picking {files[0]} out of {files}" + f"Multiple video file candidates for {video_id} in {video_dir}. " + f"Picking {files[0]} out of {files}" ) src_path = files[0] diff --git a/kolibri2zim/scraper.py b/src/kolibri2zim/scraper.py similarity index 83% rename from kolibri2zim/scraper.py rename to src/kolibri2zim/scraper.py index 28d59c1..e812305 100644 --- a/kolibri2zim/scraper.py +++ b/src/kolibri2zim/scraper.py @@ -1,38 +1,47 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -import io -import shutil import base64 -import zipfile +import concurrent.futures as cf import datetime -import tempfile -import threading import hashlib +import io import json +import shutil +import tempfile +import threading +import zipfile from pathlib import Path -from typing import Optional -import concurrent.futures as cf import jinja2 from bs4 import BeautifulSoup -from pif import get_public_ip from kiwixstorage import KiwixStorage -from zimscraperlib.zim.creator import Creator -from zimscraperlib.zim.items import StaticItem +from pif import get_public_ip +from zimscraperlib.constants import ( + MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, +) +from zimscraperlib.constants import ( + MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, +) +from zimscraperlib.filesystem import get_file_mimetype from zimscraperlib.i18n import find_language_names -from zimscraperlib.inputs import handle_user_provided_file from zimscraperlib.image.convertion import convert_image, create_favicon from zimscraperlib.image.transformation import resize_image -from zimscraperlib.filesystem import get_file_mimetype -from zimscraperlib.video.presets import VideoWebmLow, VideoWebmHigh, VideoMp4Low +from zimscraperlib.inputs import handle_user_provided_file +from zimscraperlib.video.presets import VideoMp4Low, VideoWebmHigh, VideoWebmLow +from zimscraperlib.zim.creator import Creator +from zimscraperlib.zim.items import StaticItem -from .constants import 
ROOT_DIR, getLogger, STUDIO_URL -from .database import KolibriDB -from .debug import ON_DISK_THRESHOLD, download_to, get_size_and_mime, safer_reencode +from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, Global, get_logger +from kolibri2zim.database import KolibriDB +from kolibri2zim.debug import ( + ON_DISK_THRESHOLD, + download_to, + get_size_and_mime, + safer_reencode, +) -logger = getLogger() +logger = get_logger() options = [ "debug", "name", @@ -42,6 +51,7 @@ "fname", "title", "description", + "long_description", "creator", "publisher", "tags", @@ -58,7 +68,7 @@ "about", "css", "dedup_html_files", - "node_ids" + "node_ids", ] NOSTREAM_FUNNEL_SIZE = 1024 # 2**20 * 2 # 2MiB @@ -75,14 +85,12 @@ def get_kolibri_url_for(file_id: str, ext: str): return f"{STUDIO_URL}/content/storage/{remote_path}", fname -def read_from_zip(ark, member, as_text: Optional[bool] = True): - data = ark.open(member).read() - return data.decode("utf-8") if as_text else data +def read_from_zip(ark, member): + return ark.open(member).read() class Kolibri2Zim: def __init__(self, **kwargs): - for option in options: if option not in kwargs: raise ValueError(f"Missing parameter `{option}`") @@ -101,10 +109,14 @@ def go(option): # zim params self.fname = go("fname") self.tags = ( - [] if go("tags") is None else [t.strip() for t in go("tags").split(",")] + [] + if go("tags") is None + else [t.strip() for t in go("tags").split(",")] # pyright: ignore ) + self.title = go("title") self.description = go("description") + self.long_description = go("long_description") self.author = go("creator") self.publisher = go("publisher") self.name = go("name") @@ -115,14 +127,14 @@ def go(option): self.css = go("css") # directory setup - self.output_dir = Path(go("output_dir")).expanduser().resolve() + self.output_dir = Path(go("output_dir") or "/output").expanduser().resolve() if go("tmp_dir"): - Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True) + Path(go("tmp_dir")).mkdir(parents=True, 
exist_ok=True) # pyright: ignore self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir"))) # performances options - self.nb_threads = go("threads") - self.nb_processes = go("processes") + self.nb_threads = int(go("threads") or 1) + self.nb_processes = int(go("processes") or Global.nb_available_cpus) self.s3_url_with_credentials = go("s3_url_with_credentials") self.s3_storage = None self.dedup_html_files = go("dedup_html_files") @@ -133,7 +145,9 @@ def go(option): self.debug = go("debug") self.only_topics = go("only_topics") self.node_ids = ( - None if go("node_ids") is None else [t.strip() for t in go("node_ids").split(",")] + None + if go("node_ids") is None + else [t.strip() for t in go("node_ids").split(",")] # pyright: ignore ) # jinja2 environment setup @@ -198,7 +212,12 @@ def funnel_file(self, fid, fext): url, fname = get_kolibri_url_for(fid, fext) size, mimetype = get_size_and_mime(url) - item_kw = dict(path=fname, title="", mimetype=mimetype, delete_fpath=True) + item_kw = { + "path": fname, + "title": "", + "mimetype": mimetype, + "delete_fpath": True, + } if not size or size >= ON_DISK_THRESHOLD: item_kw["fpath"] = Path( @@ -256,13 +275,12 @@ def funnel_from_s3(self, file_id, path, checksum, preset): # add to zim with self.creator_lock: - self.creator.add_item( - StaticItem( - path=path, - fileobj=fileobj, - mimetype=preset.mimetype, - ) - ) + kwargs = { + "path": path, + "fileobj": fileobj, + "mimetype": preset.mimetype, + } + self.creator.add_item(StaticItem(**kwargs)) logger.debug(f"Added {path} from S3::{key}") return True @@ -296,7 +314,10 @@ def add_topic_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, title=node["title"], content=html, mimetype="text/html" + path=node_id, + title=node["title"], + content=html, + mimetype="text/html", ) logger.debug(f"Added topic #{node_id}") @@ -313,7 +334,6 @@ def add_video_node(self, node_id): return files = sorted(files, key=lambda f: f["prio"]) it = filter(lambda f: 
f["supp"] == 0, files) - try: # find main video file video_file = next(it) @@ -347,7 +367,6 @@ def add_video_node(self, node_id): # funnel from S3 cache if it is present there if not self.funnel_from_s3(vfid, path, vchk, preset): - # download original video src = self.download_to_disk(vid, video_file["ext"]) dst = src.with_suffix(".webm") @@ -365,7 +384,6 @@ def add_video_node(self, node_id): # funnel from S3 cache if it is present there if not self.funnel_from_s3(vfid, path, vchk, preset): - # download original video src = self.download_to_disk(vid, video_file["ext"]) @@ -381,7 +399,9 @@ def add_video_node(self, node_id): # we want mp4, either in high-q or we have a low_res file to use else: - video_file = alt_video_file if self.low_quality else video_file + video_file = ( + alt_video_file if self.low_quality and alt_video_file else video_file + ) self.funnel_file(video_file["id"], video_file["ext"]) video_filename = filename_for(video_file) video_filename_ext = video_file["ext"] @@ -394,14 +414,14 @@ def add_video_node(self, node_id): local, english = find_language_names(file["lang"]) except Exception: english = file["lang"] - finally: - subtitles.append( - { - "code": file["lang"], - "name": english, - "filename": filename_for(file), - } - ) + + subtitles.append( + { + "code": file["lang"], + "name": english, + "filename": filename_for(file), + } + ) node = self.db.get_node(node_id, with_parents=True) html = self.jinja2_env.get_template("video.html").render( @@ -428,7 +448,10 @@ def add_video_upon_completion(self, future): logs error in case of failure""" if future.cancelled(): return - src_fname, dst_fpath, path = self.videos_futures.get(future) + try: + src_fname, dst_fpath, path = self.videos_futures[future] + except KeyError: + return try: future.result() @@ -487,7 +510,10 @@ def request_s3_upload_and_removal(self, item): """add file from item to uploads list""" path = item.path del item - dest_fpath, key, meta = self.pending_upload.get(path) + try: + 
dest_fpath, key, meta = self.pending_upload[path] + except KeyError: + return # TODO: submit to a thread executor (to create) instead # this is currently called on main-tread. self.upload_to_s3(key, dest_fpath, **meta) @@ -555,14 +581,14 @@ def add_exercise_node(self, node_id): for assessment_item in manifest.get("all_assessment_items", []): item_path = f"{assessment_item}.json" if item_path in zip_ark.namelist(): - perseus_content = read_from_zip(zip_ark, item_path) + perseus_content = read_from_zip(zip_ark, item_path).decode("utf-8") perseus_content = perseus_content.replace( r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}" ) perseus_content = perseus_content.replace( r"${☣ LOCALPATH}", f"./{node_id}" ) - assessment_items.append(perseus_content) + assessment_items.append(perseus_content) # add all support files to ZIM for ark_member in zip_ark.namelist(): @@ -574,12 +600,12 @@ def add_exercise_node(self, node_id): self.creator.add_item_for( path=path, title="", - content=read_from_zip(zip_ark, ark_member, as_text=False), + content=read_from_zip(zip_ark, ark_member), ) logger.debug(f"Added exercise support file {path}") # prepare and add exercise HTML article - node = self.db.get_node(node_id, with_parents=True) + node = self.db.get_node(node_id, with_parents=True, with_children=False) html = self.jinja2_env.get_template("perseus_exercise.html").render( node_id=node_id, perseus_content=f"[{', '.join(assessment_items)}]", @@ -588,7 +614,10 @@ def add_exercise_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, title=node["title"], content=html, mimetype="text/html" + path=node_id, + title=node["title"], + content=html, + mimetype="text/html", ) logger.debug(f"Added exercise node #{node_id}") @@ -702,7 +731,7 @@ def add_html5_node(self, node_id): # calculate hash of file and add entry if not in zim already content = zip_ark.open(ark_member).read() - content_hash = hashlib.md5(content).hexdigest() # nosec + content_hash 
= hashlib.md5(content).hexdigest() # nosec # noqa: S324 if content_hash not in self.html_files_cache: self.html_files_cache.append(content_hash) @@ -741,6 +770,8 @@ def run(self): f"{s3_msg}" ) + self.ensure_js_deps_are_present() + logger.info("Download database") self.download_db() @@ -763,20 +794,36 @@ def run(self): self.output_dir.mkdir(parents=True, exist_ok=True) self.creator_lock = threading.Lock() + if not self.root_id: + logger.error("Missing root id") + return 1 + if not self.title: + logger.error("Missing title") + return 1 + if not self.description: + logger.error("Missing description") + return 1 + if not self.author: + logger.error("Missing author") + return 1 + if not self.publisher: + logger.error("Missing publisher") + return 1 self.creator = Creator( - filename=self.output_dir.joinpath(self.fname), + filename=self.output_dir.joinpath(self.clean_fname), main_path=self.root_id, ignore_duplicates=True, ) self.creator.config_metadata( - Name=self.name, + Name=self.clean_fname, Language="eng", Title=self.title, Description=self.description, + LongDescription=self.long_description, Creator=self.author, Publisher=self.publisher, - Date=datetime.date.today().strftime("%Y-%d-%m"), - Illustration_48x48_at_1=self.favicon_fpath.read_bytes(), + Date=datetime.date.today(), + Illustration_48x48_at_1=self.favicon_48_fpath.read_bytes(), ) self.creator.start() @@ -824,7 +871,7 @@ def run(self): ) for future in result.done: if future.exception(): - raise future.exception() + raise future.exception() # pyright:ignore except KeyboardInterrupt: self.creator.can_finish = False logger.error("KeyboardInterrupt, exiting.") @@ -880,22 +927,39 @@ def sanitize_inputs(self): channel_meta = self.db.get_channel_metadata(self.channel_id) # input & metadata sanitation - period = datetime.datetime.now().strftime("%Y-%m") + period = datetime.date.today().strftime("%Y-%m") if self.fname: # make sure we were given a filename and not a path - self.fname = 
Path(self.fname.format(period=period)) - if Path(self.fname.name) != self.fname: + fname_path = Path(str(self.fname).format(period=period)) + if Path(fname_path.name) != fname_path: raise ValueError(f"filename is not a filename: {self.fname}") + self.clean_fname = str(fname_path) else: - self.fname = f"{self.name}_{period}.zim" + self.clean_fname = f"{self.name}_{period}.zim" if not self.title: self.title = channel_meta["name"] self.title = self.title.strip() + if self.description and len(self.description) > MAX_DESC_LENGTH: + raise ValueError( + f"Description too long ({len(self.description)}>{MAX_DESC_LENGTH})" + ) + if self.long_description and len(self.long_description) > MAX_LONG_DESC_LENGTH: + raise ValueError( + f"LongDescription too long ({len(self.long_description)}" + f">{MAX_LONG_DESC_LENGTH})" + ) + + kolibri_desc = channel_meta["description"].strip() + if not self.long_description and len(kolibri_desc) > MAX_DESC_LENGTH: + self.long_description = kolibri_desc[0:MAX_LONG_DESC_LENGTH] + if len(kolibri_desc) > MAX_LONG_DESC_LENGTH: + self.long_description = self.long_description[:-1] + "…" if not self.description: - self.description = channel_meta["description"] - self.description = self.description.strip() + self.description = kolibri_desc[0:MAX_DESC_LENGTH] + if len(kolibri_desc) > MAX_DESC_LENGTH: + self.description = self.description[:-1] + "…" if not self.author: self.author = channel_meta["author"] or "Kolibri" @@ -905,7 +969,7 @@ def sanitize_inputs(self): self.publisher = "Openzim" self.publisher = self.publisher.strip() - self.tags = list(set(self.tags + ["_category:other", "kolibri", "_videos:yes"])) + self.tags = list({*self.tags, "_category:other", "kolibri", "_videos:yes"}) def retrieve_favicon(self): favicon_orig = self.build_dir / "favicon" @@ -934,27 +998,23 @@ def retrieve_favicon(self): ) # convert to PNG (might already be PNG but it's OK) - favicon_fpath = favicon_orig.with_suffix(".png") - convert_image(favicon_orig, favicon_fpath) + 
self.favicon_48_fpath = favicon_orig.with_suffix(".48.png") + convert_image(favicon_orig, self.favicon_48_fpath) - # resize to appropriate size (ZIM uses 48x48 so we double for retina) - for size in (96, 48): - resize_image(favicon_fpath, width=size, height=size, method="thumbnail") - with open(favicon_fpath, "rb") as fh: - self.creator.add_illustration(size, fh.read()) + self.favicon_96_fpath = favicon_orig.with_suffix(".96.png") + convert_image(favicon_orig, self.favicon_96_fpath) - # resize to appropriate size (ZIM uses 48x48) - resize_image(favicon_fpath, width=96, height=96, method="thumbnail") + # resize to appropriate size (ZIM uses 48x48 so we double for retina) + resize_image(self.favicon_48_fpath, width=48, height=48, method="contain") + resize_image(self.favicon_96_fpath, width=96, height=96, method="contain") # generate favicon - favicon_ico_path = favicon_fpath.with_suffix(".ico") - create_favicon(src=favicon_fpath, dst=favicon_ico_path) - - self.favicon_fpath = favicon_fpath - self.favicon_ico_path = favicon_ico_path + self.favicon_ico_path = favicon_orig.with_suffix(".ico") + create_favicon(src=self.favicon_96_fpath, dst=self.favicon_ico_path) def add_favicon(self): - self.creator.add_item_for("favicon.png", fpath=self.favicon_fpath) + self.creator.add_illustration(96, self.favicon_96_fpath.read_bytes()) + self.creator.add_item_for("favicon.png", fpath=self.favicon_96_fpath) self.creator.add_item_for("favicon.ico", fpath=self.favicon_ico_path) def add_custom_about_and_css(self): @@ -962,14 +1022,18 @@ def add_custom_about_and_css(self): if self.about: # if user provided a custom about page, use it - with open( - handle_user_provided_file( - source=self.about, in_dir=self.build_dir, nocopy=True - ), - "r", - ) as fh: - soup = BeautifulSoup(fh.read(), "lxml") - title = soup.find("title").text + user_provided_file = handle_user_provided_file( + source=self.about, in_dir=self.build_dir, nocopy=True + ) + if not user_provided_file: + title = 
channel_meta["name"] + content = None + else: + soup = BeautifulSoup(user_provided_file.read_bytes(), "lxml") + title = soup.find("title") + if not title: + raise Exception("Failed to extract title") + title = title.text content = soup.select("body > .container") # we're only interested in the first one if isinstance(content, list): @@ -992,16 +1056,24 @@ def add_custom_about_and_css(self): # if user provided a custom CSS file, use it if self.css: - with open( - handle_user_provided_file( - source=self.css, in_dir=self.build_dir, nocopy=True - ), - "r", - ) as fh: - content = fh.read() + user_provided_file = handle_user_provided_file( + source=self.css, in_dir=self.build_dir, nocopy=True + ) + if not user_provided_file: + content = "" + else: + content = user_provided_file.read_bytes() # otherwise, create a blank one else: content = "" self.creator.add_item_for("custom.css", content=content, mimetype="text/css") logger.debug("Added about page and custom CSS") + + def ensure_js_deps_are_present(self): + for dep in JS_DEPS: + if not self.templates_dir.joinpath(f"assets/{dep}").exists(): + raise ValueError( + "It looks like JS deps have not been installed," + f" {dep} is missing" + ) diff --git a/kolibri2zim/templates/about.html b/src/kolibri2zim/templates/about.html similarity index 100% rename from kolibri2zim/templates/about.html rename to src/kolibri2zim/templates/about.html diff --git a/kolibri2zim/templates/assets/document.js b/src/kolibri2zim/templates/assets/document.js similarity index 94% rename from kolibri2zim/templates/assets/document.js rename to src/kolibri2zim/templates/assets/document.js index a0826a0..b085bf6 100644 --- a/kolibri2zim/templates/assets/document.js +++ b/src/kolibri2zim/templates/assets/document.js @@ -6,4 +6,4 @@ function resizeFrameToFullHeight(){ frame.style.height = newHeight + 'px'; } window.addEventListener('resize', resizeFrameToFullHeight, {capture: true}); -resizeFrameToFullHeight(); \ No newline at end of file 
+resizeFrameToFullHeight(); diff --git a/kolibri2zim/templates/assets/epub_embed.css b/src/kolibri2zim/templates/assets/epub_embed.css similarity index 99% rename from kolibri2zim/templates/assets/epub_embed.css rename to src/kolibri2zim/templates/assets/epub_embed.css index 25aed6d..2a5e859 100644 --- a/kolibri2zim/templates/assets/epub_embed.css +++ b/src/kolibri2zim/templates/assets/epub_embed.css @@ -280,4 +280,4 @@ svg { #opener:hover { stroke: #777; fill: #777; -} \ No newline at end of file +} diff --git a/kolibri2zim/templates/assets/epub_embed.html b/src/kolibri2zim/templates/assets/epub_embed.html similarity index 100% rename from kolibri2zim/templates/assets/epub_embed.html rename to src/kolibri2zim/templates/assets/epub_embed.html diff --git a/kolibri2zim/templates/assets/epub_embed.js b/src/kolibri2zim/templates/assets/epub_embed.js similarity index 94% rename from kolibri2zim/templates/assets/epub_embed.js rename to src/kolibri2zim/templates/assets/epub_embed.js index 6e6c0fc..fd269c5 100644 --- a/kolibri2zim/templates/assets/epub_embed.js +++ b/src/kolibri2zim/templates/assets/epub_embed.js @@ -48,4 +48,4 @@ var params = URLSearchParams && new URLSearchParams(document.location.search.sub }; rendition.on("keyup", keyListener); - document.addEventListener("keyup", keyListener, false); \ No newline at end of file + document.addEventListener("keyup", keyListener, false); diff --git a/src/kolibri2zim/templates/assets/perseus_exercise.js b/src/kolibri2zim/templates/assets/perseus_exercise.js new file mode 100644 index 0000000..bce3f89 --- /dev/null +++ b/src/kolibri2zim/templates/assets/perseus_exercise.js @@ -0,0 +1 @@ +less = { env: 'development', logLevel: 1 }; diff --git a/kolibri2zim/templates/audio.html b/src/kolibri2zim/templates/audio.html similarity index 100% rename from kolibri2zim/templates/audio.html rename to src/kolibri2zim/templates/audio.html diff --git a/kolibri2zim/templates/base.html b/src/kolibri2zim/templates/base.html similarity 
index 100% rename from kolibri2zim/templates/base.html rename to src/kolibri2zim/templates/base.html diff --git a/kolibri2zim/templates/card.html b/src/kolibri2zim/templates/card.html similarity index 100% rename from kolibri2zim/templates/card.html rename to src/kolibri2zim/templates/card.html diff --git a/kolibri2zim/templates/document.html b/src/kolibri2zim/templates/document.html similarity index 99% rename from kolibri2zim/templates/document.html rename to src/kolibri2zim/templates/document.html index eec2b56..18dba86 100644 --- a/kolibri2zim/templates/document.html +++ b/src/kolibri2zim/templates/document.html @@ -49,7 +49,7 @@ {% endblock %} diff --git a/kolibri2zim/templates/epub.html b/src/kolibri2zim/templates/epub.html similarity index 99% rename from kolibri2zim/templates/epub.html rename to src/kolibri2zim/templates/epub.html index 306ca09..00f949c 100644 --- a/kolibri2zim/templates/epub.html +++ b/src/kolibri2zim/templates/epub.html @@ -8,5 +8,3 @@
You should get an epub reader here someday. In the mean time, just Open EPUB directly