diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml new file mode 100644 index 0000000..a347c51 --- /dev/null +++ b/.github/workflows/Publish.yaml @@ -0,0 +1,42 @@ +name: Publish released version + +on: + release: + types: [published] + +jobs: + publish: + runs-on: ubuntu-22.04 + permissions: + id-token: write # mandatory for PyPI trusted publishing + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Build packages + run: | + pip install -U pip build + python -m build --sdist --wheel + + - name: Upload to PyPI + uses: pypa/gh-action-pypi-publish@release/v1.8 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/ifixit + tag-pattern: /^v([0-9.]+)$/ + latest-on-tag: true + restrict-to: openzim/ifixit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml new file mode 100644 index 0000000..7292a6a --- /dev/null +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -0,0 +1,27 @@ +name: Publish Docker dev image + +on: + push: + branches: + - main + +jobs: + publish: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/ifixit + manual-tag: dev + latest-on-tag: false + restrict-to: openzim/ifixit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml new file mode 100644 index 0000000..48ccee5 --- /dev/null +++ b/.github/workflows/QA.yaml @@ -0,0 +1,34 @@ +name: QA + +on: + pull_request: + push: + branches: + - main + +jobs: + check-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[lint,scripts,test,check] + + - name: Check black formatting + run: inv lint-black + + - name: Check ruff + run: inv lint-ruff + + - name: Check pyright + run: inv check-pyright diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml new file mode 100644 index 0000000..838269f --- /dev/null +++ b/.github/workflows/Tests.yaml @@ -0,0 +1,62 @@ +name: Tests + +on: + pull_request: + push: + branches: + - main + +jobs: + run-tests: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[test,scripts] + + - name: Run the tests + run: inv coverage --args "-vvv" + + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + + build_python: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Ensure we can build Python 
targets + run: | + pip install -U pip build + python3 -m build --sdist --wheel + + build_docker: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Ensure we can build the Docker image + run: | + docker build -t testimage . + + - name: Ensure we can start the Docker image + run: | + docker run --rm testimage diff --git a/.github/workflows/qa.yml b/.github/workflows/qa.yml deleted file mode 100644 index 35aa095..0000000 --- a/.github/workflows/qa.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: QA - -on: [push, pull_request] - -env: - # black default - MAX_LINE_LENGTH: 88 - -jobs: - check-qa: - runs-on: ubuntu-22.04 - steps: - - name: Retrieve source code - uses: actions/checkout@v3 - - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v1 - with: - python-version: 3.8 - architecture: x64 - - - name: Check black formatting - run: | - pip install black==22.3.0 - black --version - black --check . - - - name: Check flake8 linting - run: | - pip install flake8==3.9.2 - flake8 --version - flake8 . --count --max-line-length=$MAX_LINE_LENGTH --statistics - - - name: Check import order with isort - run: | - pip install isort==5.9.3 - isort --version - isort --profile black --check . diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index d09640e..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: release -on: - release: - types: [published] - tags: - - v* - -env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - -jobs: - release: - environment: release - runs-on: ubuntu-22.04 - - steps: - - name: Retrieve source code - uses: actions/checkout@v3 - - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: 3.8 - architecture: x64 - - - name: Build sdist and wheel - run: | - pip install --upgrade setuptools pip wheel - python3 setup.py sdist - python3 setup.py bdist_wheel - - - name: Push release to PyPI - if: github.event_name == 'release' - run: | - pip install --upgrade twine - twine check dist/* - twine upload dist/* diff --git a/.gitignore b/.gitignore index 7b75bdd..a68de0c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,213 @@ -**/*.zim -ifixit2zim/tests -output -dist -cache -venv +# Created by https://www.toptal.com/developers/gitignore/api/python,macos +# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
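+# (note: PyInstaller is not used by this project; the entries below come from
+# the generated toptal template and are assumed to be harmless defaults)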
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env .venv -saved_pages -**/__pycache__ +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json +# End of https://www.toptal.com/developers/gitignore/api/python,macos +# ignore all vscode, this is not standard configuration in this place +.vscode +output \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4380fad --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/psf/black + rev: "24.2.0" + hooks: + - id: black +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: "0.3.0" + hooks: + - id: ruff +- repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.352 + hooks: + - id: pyright + name: pyright (system) + description: 'pyright static type checker' + entry: pyright + language: system + 'types_or': [python, pyi] + require_serial: true + minimum_pre_commit_version: '2.9.2' diff --git a/.vscode/settings.json b/.vscode/settings.json index c4af4f9..9d9b8bd 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,8 +1,11 @@ { - "files.exclude": { - "cache": true, - "dist": true, - "venv": true + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + }, }, - "editor.rulers": [88] + "python.analysis.typeCheckingMode": "basic", + "editor.rulers": [88], } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index b91633f..699edf9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,38 @@ -# 0.2.4 +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Fixed + +- iFixit API is returning "null" when listing categories (#93) + +## [0.2.4] - 2023-01-05 + +### Fixed - Adapt to changes in upstream iFixit main page HTML content -# 0.2.3 +## [0.2.3] - 2022-10-20 + +### Fixed - Do not process unrecognized href, i.e. pointing outside iFixit -# 0.2.2 +## [0.2.2] - 2022-10-04 + +### Fixed - Fixed URL normalization on articles redirecting outside domain (help.ifixit.com) -# 0.2.1 -See [milestone](https://github.com/openzim/ifixit/milestone/3) for advanced details. +## [0.2.1] - 2022-06-02 + +### Fixed -Small bugs fixes: - Report more clearly in the log when no ZIM is produced on-purpose + produce the ZIM even if some error occurred - Remove unused log about number of images scraped - Fix issue with unquoted normalized URLs before regex matching @@ -24,16 +43,15 @@ Small bugs fixes: - URLs of missing items are not encoded properly - Issues with the "Load more comments" button in guides -# 0.2.0 -See [milestone](https://github.com/openzim/ifixit/milestone/1) for advanced details.
+## [0.2.0] - 2022-05-04 + +### Added - Render tools and parts on guides / categories - Render comments on guides - Scrape user pages (only the ones linked as an author or in a comment) - Use a nice looking URL scheme (instead of the previous technical one) - Report about scraper progression (useful for ZimFarm monitoring) -- Fix issue about items being scraped twice due to int / str difference -- Fix issue about ANCHOR links - Add a nice page for missing / error items to avoid dead links - Add a nice looking page for external URLs - Handle URL-encoded category titles found in links @@ -42,7 +60,12 @@ See [milestone](https://github.com/openzim/ifixit/milestone/1) for advanced deta - Detect duplicate images and replace them with a redirect - Documentation for PyPI installation +### Fixed +- Fix issue about items being scraped twice due to int / str difference +- Fix issue about ANCHOR links + +## [0.1.0] - 2022-04-17 -# 0.1.0 +### Added - initial version diff --git a/Dockerfile b/Dockerfile index 841e5a6..428a031 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,34 @@ -FROM python:3.8-slim +FROM python:3.12-slim-bookworm LABEL org.opencontainers.image.source https://github.com/openzim/ifixit # Install necessary packages # TODO: do we really need all these packages? -RUN apt-get update -y \ && apt-get install -y --no-install-recommends locales libmagic1 wget ffmpeg \ libtiff5-dev libjpeg-dev libopenjp2-7-dev zlib1g-dev \ libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python3-tk \ libharfbuzz-dev libfribidi-dev libxcb1-dev gifsicle curl unzip \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + locales \ + locales-all \ + libmagic1 \ + wget \ + ffmpeg \ + libtiff5-dev \ + libjpeg-dev \ + libopenjp2-7-dev \ + zlib1g-dev \ + libfreetype6-dev \ + liblcms2-dev \ + libwebp-dev \ + tcl8.6-dev \ + tk8.6-dev \ + python3-tk \ + libharfbuzz-dev \ + libfribidi-dev \ + libxcb1-dev \ + gifsicle \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* \ + && python -m pip install --no-cache-dir -U \ + pip # setup timezone and locale ENV TZ "UTC" @@ -21,13 +40,20 @@ ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 -COPY requirements.pip /src/ -RUN pip3 install --no-cache-dir -r /src/requirements.pip -COPY ifixit2zim /src/ifixit2zim -COPY setup.py *.md MANIFEST.in /src/ -RUN cd /src/ \ && python3 ./setup.py install \ && rm -r /src \ && mkdir -p /output +# Copy pyproject.toml and its dependencies +COPY pyproject.toml README.md /src/ +COPY src/ifixit2zim/__about__.py /src/src/ifixit2zim/__about__.py + +# Install Python dependencies +RUN pip install --no-cache-dir /src + +# Copy code + associated artifacts +COPY src /src/src +COPY *.md /src/ + +# Install + cleanup +RUN pip install --no-cache-dir /src \ + && rm -rf /src \ + && mkdir -p /output CMD ["ifixit2zim", "--help"] diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index f9be6c2..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include *.md -include requirements.pip -recursive-include ifixit2zim * diff --git a/README.md b/README.md index 9a2eab6..029f93d 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Docker container as explained below. First, build the Docker image (to be run in the main folder of this repo): ``` -docker build -t ghcr.io/openzim/ifixit:local . +docker build -t local-ifixit .
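+# optionally confirm the image was created (assumes a local Docker daemon):
+docker image ls local-ifixit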
``` Then run the scraper with CLI arguments needed for your test (everything after `ifixit2zim` in the example below). @@ -85,7 +85,7 @@ Then run the scraper with CLI arguments needed for your test (everything after ` For instance, if you want to run a scrape of only the `Apple_PDA` category, including its guides, in French : ``` -docker run -it -v $(pwd)/output:/output --rm ghcr.io/openzim/fixit:local ifixit2zim --language fr --output /output --tmp-dir /tmp --category Apple_PDA +docker run -it -v $(pwd)/output:/output --rm local-ifixit ifixit2zim --language fr --output /output --tmp-dir /tmp --category Apple_PDA ``` This will produce a ZIM in the output folder of your current directory. diff --git a/ifixit2zim/VERSION b/ifixit2zim/VERSION deleted file mode 100644 index 72f9fa8..0000000 --- a/ifixit2zim/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.4 \ No newline at end of file diff --git a/ifixit2zim/__main__.py b/ifixit2zim/__main__.py deleted file mode 100644 index 104fa17..0000000 --- a/ifixit2zim/__main__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- - -import pathlib -import sys - - -def main(): - # allows running it from source using python ifixit2zim - sys.path = [str(pathlib.Path(__file__).parent.parent.resolve())] + sys.path - - from ifixit2zim.entrypoint import main as entry - - entry() - - -if __name__ == "__main__": - main() diff --git a/ifixit2zim/exceptions.py b/ifixit2zim/exceptions.py deleted file mode 100644 index 136b084..0000000 --- a/ifixit2zim/exceptions.py +++ /dev/null @@ -1,10 +0,0 @@ -class FinalScrapingFailure(Exception): - pass - - -class UnexpectedDataKindException(Exception): - pass - - -class CategoryHomePageContentError(Exception): - pass diff --git a/ifixit2zim/scraper.py b/ifixit2zim/scraper.py deleted file mode 100644 index 647b12c..0000000 --- a/ifixit2zim/scraper.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- coding: utf-8 -*- - -import json -import pathlib -import shutil -from datetime import datetime - -from schedule import every -from zimscraperlib.image.transformation import resize_image - -from .constants import ROOT_DIR, Conf -from .scraper_category import ScraperCategory -from .scraper_guide import ScraperGuide -from .scraper_homepage import ScraperHomepage -from .scraper_info import ScraperInfo -from .scraper_user import ScraperUser -from .shared import Global, GlobalMixin, logger -from .utils import setup_s3_and_check_credentials - - -class ifixit2zim(GlobalMixin): - def __init__(self, **kwargs): - - Global.conf = Conf(**kwargs) - for option in Global.conf.required: - if getattr(Global.conf, option) is None: - raise ValueError(f"Missing parameter `{option}`") - - self.scraper_homepage = ScraperHomepage() - self.scraper_guide = ScraperGuide() - self.scraper_category = ScraperCategory() - self.scraper_info = ScraperInfo() - self.scraper_user = ScraperUser() - self.scrapers = [ - self.scraper_homepage, - self.scraper_category, - self.scraper_guide, - self.scraper_info, - self.scraper_user, - ] - - @property - def build_dir(self): - return self.conf.build_dir - - def cleanup(self): - """Remove temp files and release resources before exiting""" - if not self.conf.keep_build_dir: - logger.debug(f"Removing {self.build_dir}") - shutil.rmtree(self.build_dir, ignore_errors=True) - - def sanitize_inputs(self): - """input & metadata sanitation""" - logger.debug("Checking user-provided metadata") - - if not self.conf.name: - is_selection = ( - self.conf.categories - or self.conf.guides - or self.conf.infos - or self.conf.no_category - or 
self.conf.no_guide - or self.conf.no_info - ) - self.conf.name = "ifixit_{lang}_{selection}".format( - lang=self.conf.language["iso-639-1"], - selection="selection" if is_selection else "all", - ) - - period = datetime.now().strftime("%Y-%m") - if self.conf.fname: - # make sure we were given a filename and not a path - self.conf.fname = pathlib.Path(self.conf.fname.format(period=period)) - if pathlib.Path(self.conf.fname.name) != self.conf.fname: - raise ValueError(f"filename is not a filename: {self.conf.fname}") - else: - self.conf.fname = f"{self.conf.name}_{period}.zim" - - if not self.conf.title: - self.conf.title = self.metadata["title"] - self.conf.title = self.conf.title.strip() - - if not self.conf.description: - self.conf.description = self.metadata["description"] - self.conf.description = self.conf.description.strip() - - if not self.conf.author: - self.conf.author = "iFixit" - self.conf.author = self.conf.author.strip() - - if not self.conf.publisher: - self.conf.publisher = "openZIM" - self.conf.publisher = self.conf.publisher.strip() - - self.conf.tags = list( - set( - self.conf.tag - + ["_category:iFixit", "iFixit", "_videos:yes", "_pictures:yes"] - ) - ) - - logger.debug( - "Configuration after sanitization:\n" - f"name: {self.conf.name}\n" - f"fname: {self.conf.fname}\n" - f"name: {self.conf.author}\n" - f"fname: {self.conf.publisher}" - ) - - def add_assets(self): - """download and add site-wide assets, identified in metadata step""" - logger.info("Adding assets") - - # recursively add our assets, at a path identical to position in repo - assets_root = pathlib.Path(ROOT_DIR.joinpath("assets")) - for fpath in assets_root.glob("**/*"): - if not fpath.is_file(): - continue - path = str(fpath.relative_to(ROOT_DIR)) - - logger.debug(f"> {path}") - with self.lock: - self.creator.add_item_for(path=path, fpath=fpath) - - def add_illustrations(self): - logger.info("Adding illustrations") - - src_illus_fpath = pathlib.Path(ROOT_DIR.joinpath("assets", "illustration.png")) - tmp_illus_fpath = pathlib.Path(self.build_dir, "illustration.png") - - shutil.copy(src_illus_fpath, tmp_illus_fpath) - - # resize to appropriate size (ZIM uses 48x48 so we double for retina) - for size in (96, 48): - resize_image(tmp_illus_fpath, width=size, height=size, method="thumbnail") - with open(tmp_illus_fpath, "rb") as fh: - with self.lock: - self.creator.add_illustration(size, fh.read()) - - def run(self): - # first report => creates a file with appropriate structure - self.report_progress() - - s3_storage = ( - setup_s3_and_check_credentials(self.conf.s3_url_with_credentials) - if self.conf.s3_url_with_credentials - else None - ) - s3_msg = ( - f"\n" - f" using cache: {s3_storage.url.netloc} " - f"with bucket: {s3_storage.bucket_name}" - if s3_storage - else "" - ) - del s3_storage - - logger.info( - f"Starting scraper with:\n" - f" language: {self.conf.language['english']}" - f" ({self.conf.domain})\n" - f" output_dir: {self.conf.output_dir}\n" - f" build_dir: {self.build_dir}\n" - f"{s3_msg}" - ) - - Global.metadata = self.scraper_homepage.get_online_metadata() - logger.debug( - f"Additional metadata scrapped online:\n" - f"title: {self.metadata['title']}\n" - f"description: {self.metadata['description']}\n" - f"stats: {self.metadata['stats']}\n" - ) - self.sanitize_inputs() - - logger.debug("Starting Zim creation") - Global.setup() - Global.env.filters[ - "get_category_link_from_obj" - ] = self.scraper_category.get_category_link_from_obj - Global.env.filters[ - "get_category_link_from_props" - ] = 
self.scraper_category.get_category_link_from_props - Global.env.filters[ - "get_guide_link_from_obj" - ] = self.scraper_guide.get_guide_link_from_obj - Global.env.filters[ - "get_guide_link_from_props" - ] = self.scraper_guide.get_guide_link_from_props - Global.env.filters[ - "get_info_link_from_obj" - ] = self.scraper_info.get_info_link_from_obj - Global.env.filters[ - "get_info_link_from_props" - ] = self.scraper_info.get_info_link_from_props - Global.env.filters[ - "get_user_link_from_obj" - ] = self.scraper_user.get_user_link_from_obj - Global.env.filters[ - "get_user_link_from_props" - ] = self.scraper_user.get_user_link_from_props - Global.get_category_link_from_props = ( - self.scraper_category.get_category_link_from_props - ) - Global.get_guide_link_from_props = self.scraper_guide.get_guide_link_from_props - Global.get_info_link_from_props = self.scraper_info.get_info_link_from_props - Global.get_user_link_from_props = self.scraper_user.get_user_link_from_props - for scraper in self.scrapers: - scraper.setup() - self.creator.start() - - try: - - self.add_assets() - self.add_illustrations() - - for scraper in self.scrapers: - scraper.build_expected_items() - self.report_progress() - - # set a timer to report progress only every 10 seconds, not need to do it - # after every item scrapped - every(10).seconds.do(self.report_progress) - - while True: - for scraper in self.scrapers: - scraper.scrape_items() - needs_rerun = False - if not Global.conf.scrape_only_first_items: - for scraper in self.scrapers: - if not scraper.items_queue.empty(): - needs_rerun = True - if not needs_rerun: - break - - logger.info("Awaiting images") - Global.img_executor.shutdown() - - self.report_progress() - - stats = "Stats: " - for scraper in self.scrapers: - stats += ( - f"{len(scraper.expected_items_keys)} {scraper.get_items_name()}, " - ) - for scraper in self.scrapers: - stats += ( - f"{len(scraper.missing_items_keys)} missing" - f" {scraper.get_items_name()}, " - ) - for scraper in self.scrapers: - stats += ( - f"{len(scraper.error_items_keys)} {scraper.get_items_name()}" - " in error, " - ) - stats += f"{len(self.imager.handled)} images" - - logger.info(stats) - - logger.info("Null categories:") - for key in Global.null_categories: - logger.info(f"\t{key}") - - logger.info("IFIXIT_EXTERNAL URLS:") - for exturl in sorted(Global.ifixit_external_content): - logger.info(f"\t{exturl}") - - except Exception as exc: - # request Creator not to create a ZIM file on finish - self.creator.can_finish = False - if isinstance(exc, KeyboardInterrupt): - logger.error("KeyboardInterrupt, exiting.") - else: - logger.error(f"Interrupting process due to error: {exc}") - logger.exception(exc) - self.imager.abort() - Global.img_executor.shutdown(wait=False) - return 1 - else: - if self.creator.can_finish: - logger.info("Finishing ZIM file") - with self.lock: - self.creator.finish() - logger.info( - f"Finished Zim {self.creator.filename.name} " - f"in {self.creator.filename.parent}" - ) - finally: - logger.info("Cleaning up") - with self.lock: - self.cleanup() - - logger.info("Scraper has finished normally") - - def report_progress(self): - if not Global.conf.stats_filename: - return - done = 0 - total = 0 - for scraper in self.scrapers: - scraper_total = len(scraper.expected_items_keys) + len( - scraper.unexpected_items_keys - ) - scraper_remains = scraper.items_queue.qsize() - scraper_done = scraper_total - scraper_remains - total += scraper_total - done += scraper_done - progress = { - "done": done, - "total": total, - 
} - with open(Global.conf.stats_filename, "w") as outfile: - json.dump(progress, outfile, indent=2) diff --git a/ifixit2zim/shared.py b/ifixit2zim/shared.py deleted file mode 100644 index a70b327..0000000 --- a/ifixit2zim/shared.py +++ /dev/null @@ -1,524 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu -# pylint: disable=cyclic-import - -import locale -import logging -import re -import threading -import urllib -from contextlib import contextmanager -from datetime import date, datetime - -import requests -from jinja2 import Environment, FileSystemLoader, select_autoescape -from zimscraperlib.logging import getLogger as lib_getLogger -from zimscraperlib.zim.creator import Creator - -from .constants import ( - DEFAULT_DEVICE_IMAGE_URL, - DEFAULT_GUIDE_IMAGE_URL, - DEFAULT_HOMEPAGE, - DEFAULT_USER_IMAGE_URLS, - DEFAULT_WIKI_IMAGE_URL, - NAME, - NOT_YET_AVAILABLE, - ROOT_DIR, - UNAVAILABLE_OFFLINE, -) - -LOCALE_LOCK = threading.Lock() - - -class ImageUrlNotFound(Exception): - pass - - -@contextmanager -def setlocale(name): - with LOCALE_LOCK: - saved = locale.setlocale(locale.LC_ALL) - try: - yield locale.setlocale(locale.LC_ALL, name) - finally: - locale.setlocale(locale.LC_ALL, saved) - - -class Global: - """Shared context accross all scraper components""" - - debug = False - logger = lib_getLogger( - NAME, - level=logging.INFO, - log_format="[%(threadName)s::%(asctime)s] %(levelname)s:%(message)s", - ) - conf = None - - metadata = {} - - creator = None - imager = None - env = None - lock = threading.Lock() - - null_categories = set() - ifixit_external_content = set() - final_hrefs = dict() - - @staticmethod - def set_debug(value): - Global.debug = value - level = logging.DEBUG if value else logging.INFO - Global.logger.setLevel(level) - for handler in Global.logger.handlers: - handler.setLevel(level) - - @staticmethod - def setup(): - # order matters are there are references between them - - # images handled on a different queue. - # mostly network I/O to retrieve and/or upload image. 
- # if not in S3 bucket, convert/optimize webp image - # svg images, stored but not optimized - from .executor import Executor - - Global.img_executor = Executor( - queue_size=100, - nb_workers=50, - prefix="IMG-T-", - ) - - from .imager import Imager - - Global.imager = Imager() - - Global.creator = Creator( - filename=Global.conf.output_dir.joinpath(Global.conf.fname), - main_path=DEFAULT_HOMEPAGE, - favicon_path="illustration", - language=Global.conf.language["iso-639-3"], - workaround_nocancel=False, - title=Global.conf.title, - description=Global.conf.description, - creator=Global.conf.author, - publisher=Global.conf.publisher, - name=Global.conf.name, - tags=";".join(Global.conf.tags), - date=date.today(), - ).config_verbose(True) - - # jinja2 environment setup - Global.env = Environment( - loader=FileSystemLoader(ROOT_DIR.joinpath("templates")), - autoescape=select_autoescape(), - ) - Global.env.globals["raise"] = Global._raise_helper - Global.env.globals["str"] = lambda x: str(x) - Global.env.filters["guides_in_progress"] = Global.guides_in_progress - Global.env.filters["category_count_parts"] = Global.category_count_parts - Global.env.filters["category_count_tools"] = Global.category_count_tools - Global.env.filters["get_image_path"] = Global.get_image_path - Global.env.filters["get_image_url"] = Global.get_image_url - Global.env.filters["cleanup_rendered_content"] = Global.cleanup_rendered_content - Global.env.filters[ - "get_timestamp_day_rendered" - ] = Global.get_timestamp_day_rendered - Global.env.filters["get_item_comments_count"] = Global.get_item_comments_count - Global.env.filters[ - "get_guide_total_comments_count" - ] = Global.get_guide_total_comments_count - Global.env.filters["get_user_display_name"] = Global.get_user_display_name - - @staticmethod - def _raise_helper(msg): - raise Exception(msg) - - @staticmethod - def guides_in_progress(guides, in_progress=True): - if in_progress: - return [guide for guide in guides if "GUIDE_IN_PROGRESS" in guide["flags"]] - return [guide for guide in guides if "GUIDE_IN_PROGRESS" not in guide["flags"]] - - @staticmethod - def category_count_parts(category): - if "parts" not in category: - return 0 - if "total" not in category["parts"]: - return 0 - return category["parts"]["total"] - - @staticmethod - def category_count_tools(category): - if "tools" not in category: - return 0 - return len(category["tools"]) - - @staticmethod - def get_image_path(image_url): - return Global.imager.defer(url=image_url) - - @staticmethod - def _get_image_url_search(obj, for_guide, for_device, for_wiki, for_user): - if "standard" in obj: - return obj["standard"] - if "medium" in obj: - return obj["medium"] - if "large" in obj: - return obj["large"] - if "original" in obj: - return obj["original"] - if for_guide: - return DEFAULT_GUIDE_IMAGE_URL - if for_device: - return DEFAULT_DEVICE_IMAGE_URL - if for_wiki: - return DEFAULT_WIKI_IMAGE_URL - if for_user and "userid" in obj: - idx = obj["userid"] % len(DEFAULT_USER_IMAGE_URLS) - return DEFAULT_USER_IMAGE_URLS[idx] - raise ImageUrlNotFound(f"Unable to find image URL in object {obj}") - - @staticmethod - def get_image_url( - obj, for_guide=False, for_device=False, for_wiki=False, for_user=False - ): - if "image" in obj and obj["image"]: - return Global._get_image_url_search( - obj["image"], for_guide, for_device, for_wiki, for_user - ) - return Global._get_image_url_search( - obj, for_guide, for_device, for_wiki, for_user - ) - - guide_regex_full = re.compile( - 
r"href=\"https://\w*\.ifixit\.\w*/Guide/.*/(?P\d*)\"" - ) - guide_regex_rel = re.compile(r"href=\"/Guide/.*/(?P\d*).*?\"") - - gbl_image_regex = r".*?)src\s*=\s*\"(?P.*?)\"" - gbl_href_regex = r"href\s*=\s*\"(?P.*?)\"" - gbl_youtube_regex = ( - r"(?!.*.+?)src=[\\\"']+(?P.+?)\"(?P.+?)" - ) - gbl_bgd_image_regex = ( - r"background-image:url\((?P"|\"|')" - r"(?P.*?)(?P"|\"|')\)" - ) - gbl_video_regex = r".*)" - gbl_iframe_regex = r".*?)\".*?" - gbl_regex = re.compile( - f"{gbl_image_regex}|{gbl_href_regex}|{gbl_youtube_regex}|{gbl_bgd_image_regex}" - f"|{gbl_video_regex}|{gbl_iframe_regex}" - ) - - href_anchor_regex = r"^(?P#.*)$" - href_object_kind_regex = ( - r"^(?:https*://[\w\.]*(?:ifixit)[\w\.]*)*/" - r"((?:(?P" - + "|".join(NOT_YET_AVAILABLE + UNAVAILABLE_OFFLINE) - + r")(?:/.+)?)" - r"|(?:(?PGuide|Anleitung|Guía|Guida|Tutoriel|Teardown)/" - r"(?P.+)/(?P\d+)(?P#.*)?.*)" - r"|(?:(?PDevice|Topic)/(?P[\w%_\.-]+)" - r"(?P#.*)?.*)" - r"|(?PUser)/(?P\d*)/(?P[\w%_\.+'-]+)" - r"(?P#.*)?.*" - r"|(?:(?PInfo)/(?P[\w%_\.-]+)(?P#.*)?.*))$" - ) - href_regex = re.compile( - f"{href_anchor_regex}|{href_object_kind_regex}", flags=re.IGNORECASE - ) - - @staticmethod - def _process_external_url(url, rel_prefix): - if "ifixit" in url: - Global.ifixit_external_content.add(url) - return f"{rel_prefix}home/external_content?url={urllib.parse.quote(url)}" - - @staticmethod - def _process_unrecognized_href(url, rel_prefix): - return Global._process_external_url(url, rel_prefix) - - def _process_href_regex_dynamics(href, rel_prefix): - if "Guide/login/register" in href or "Guide/new" in href: - return ( - f"{rel_prefix}home/unavailable_offline" - f"?url={urllib.parse.quote(href)}" - ) - return None - - def _process_href_regex_nomatch(href, rel_prefix, match): - if match: - return None - return Global._process_unrecognized_href(href, rel_prefix) - - def _process_href_regex_anchor(href, rel_prefix, match): - if not match.group("anchor"): - return None - return f"{match.group('anchor')}" - - def _process_href_regex_guide(href, rel_prefix, match): - if not match.group("guide"): - return None - link = Global.get_guide_link_from_props( - guideid=match.group("guideid"), - guidetitle=urllib.parse.unquote_plus(match.group("guidetitle")), - ) - return f"{rel_prefix}{link}{match.group('guideafter') or ''}" - - def _process_href_regex_device(href, rel_prefix, match): - if not match.group("device"): - return None - link = Global.get_category_link_from_props( - category_title=urllib.parse.unquote_plus(match.group("devicetitle")) - ) - return f"{rel_prefix}{link}{match.group('deviceafter') or ''}" - - def _process_href_regex_info(href, rel_prefix, match): - if not match.group("info"): - return None - link = Global.get_info_link_from_props( - info_title=urllib.parse.unquote_plus(match.group("infotitle")) - ) - return f"{rel_prefix}{link}" f"{match.group('infoafter') or ''}" - - def _process_href_regex_user(href, rel_prefix, match): - if not match.group("user"): - return None - link = Global.get_user_link_from_props( - userid=match.group("userid"), - usertitle=urllib.parse.unquote_plus(match.group("usertitle")), - ) - return f"{rel_prefix}{link}" f"{match.group('userafter') or ''}" - - def _process_href_regex_kind(href, rel_prefix, match): - if not match.group("kind"): - return None - if match.group("kind").lower() in NOT_YET_AVAILABLE: - return ( - f"{rel_prefix}home/not_yet_available" f"?url={urllib.parse.quote(href)}" - ) - if match.group("kind").lower() in UNAVAILABLE_OFFLINE: - return ( - 
f"{rel_prefix}home/unavailable_offline" - f"?url={urllib.parse.quote(href)}" - ) - raise Exception( - f"Unsupported kind '{match.group('kind')}' in _process_href_regex" - ) - - @staticmethod - def normalize_href(href): - if href in Global.final_hrefs: - return Global.final_hrefs[href] - try: - logger.debug(f"Normalizing href {href}") - # final_href = requests.head(href).headers.get("Location") - # if final_href is None: - # logger.debug(f"Failed to HEAD {href}, falling back to GET") - final_href = requests.get(href, stream=True).url - # parse final href and remove scheme + netloc + slash - parsed_final_href = urllib.parse.urlparse(final_href) - parsed_href = urllib.parse.urlparse(href) - chars_to_remove = len(parsed_final_href.scheme + "://") - - # remove domain if redirect is on same domain (almost always) - if parsed_final_href.netloc == parsed_href.netloc: - chars_to_remove += len(parsed_final_href.netloc) - - final_href = final_href[chars_to_remove:] - final_href = urllib.parse.unquote(final_href) - except Exception: - # this is quite expected for some missing items ; this will be taken care - # of at retrieval, no way to do something better - final_href = href - Global.final_hrefs[href] = final_href - logger.debug(f"Result is {final_href}") - return final_href - - @staticmethod - def _process_href_regex(href, rel_prefix): - if href.startswith("/"): - href = Global.conf.main_url.geturl() + href - if href.startswith("http") and "ifixit.com/" in href: - href = Global.normalize_href(href) - href = urllib.parse.quote(href) - match = Global.href_regex.search(href) - res = ( - Global._process_href_regex_dynamics(href, rel_prefix) - or Global._process_href_regex_nomatch(href, rel_prefix, match) - or Global._process_href_regex_anchor(href, rel_prefix, match) - or Global._process_href_regex_guide(href, rel_prefix, match) - or Global._process_href_regex_device(href, rel_prefix, match) - or Global._process_href_regex_info(href, rel_prefix, match) - or Global._process_href_regex_user(href, rel_prefix, match) - or Global._process_href_regex_kind(href, rel_prefix, match) - ) - if res is None: - raise Exception("Unsupported match in _process_href_regex") - return res - - @staticmethod - def _process_youtube(match, rel_prefix): - return ( - f'" - f"" - ) - - @staticmethod - def _process_bgdimgurl(match, rel_prefix): - return ( - f"background-image:url({match.group('quote1')}{rel_prefix}" - f"{Global.get_image_path(match.group('bgdimgurl'))}" - f"{match.group('quote2')})" - ) - - @staticmethod - def _process_video(match, rel_prefix): - return "

Video not scrapped

" - - @staticmethod - def _process_iframe(match, rel_prefix): - return ( - f'External content' - ) - - @staticmethod - def _process_gbl_regex(match, rel_prefix): - if match.group("image_url"): - return ( - f" 0: - return user["username"] - if user["unique_username"] and len(user["unique_username"]) > 0: - return f"@{user['unique_username']}" - return "Anonymous" - - -class GlobalMixin: - @property - def conf(self): - return Global.conf - - @property - def metadata(self): - return Global.metadata - - @property - def creator(self): - return Global.creator - - @property - def lock(self): - return Global.lock - - @property - def imager(self): - return Global.imager - - @property - def executor(self): - return Global.executor - - @property - def env(self): - return Global.env - - @property - def info_wiki_template(self): - return Global.info_wiki_template - - @property - def ifixit_external_content(self): - return Global.ifixit_external_content - - -logger = Global.logger diff --git a/ifixit2zim/utils.py b/ifixit2zim/utils.py deleted file mode 100644 index 22ab801..0000000 --- a/ifixit2zim/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - -import io -import re -import urllib.parse -import zlib -from typing import Union - -import backoff -import bs4 -import requests -from kiwixstorage import KiwixStorage -from pif import get_public_ip -from zimscraperlib.download import _get_retry_adapter, stream_file - -from .constants import API_PREFIX -from .shared import Global, logger - - -def to_path(url: str) -> str: - """Path-part of an URL, without leading slash""" - return re.sub(r"^/", "", urllib.parse.urlparse(url).path) - - -def get_url(path: str, **params) -> str: - """url-encoded in-source website url for a path""" - params_str = f"?{urllib.parse.urlencode(params)}" if params else "" - return f"{Global.conf.main_url.geturl()}{urllib.parse.quote(path)}{params_str}" - - -def get_url_raw(path: str): - """in-source website url for a path, untainted""" - return f"{Global.conf.main_url.geturl()}{path}" - - -def to_url(value: str) -> str: - """resolved potentially relative url from in-source link""" - return value if value.startswith("http") else get_url_raw(value) - - -def to_rel(url: str) -> Union[None, str]: - """path from URL if on our main domain, else None""" - uri = urllib.parse.urlparse(url) - if uri.netloc != Global.conf.domain: - return None - return uri.path - - -def no_leading_slash(text: str) -> str: - """text with leading slash removed if present""" - return re.sub(r"^/", "", text) - - -def no_trailing_slash(text: str) -> str: - """text with trailing slash removed if present""" - return re.sub(r"/$", "", text) - - -def only_path_of(url: str): - """normalized path part of an url""" - return normalize_ident(urllib.parse.urlparse(url).path) - - -def fetch(path: str, **params) -> str: - """(source text, actual_paths) of a path from source website - - actual_paths is amn ordered list of paths that were traversed to get to content. 
- Without redirection, it should be a single path, equal to request - Final, target path is always last""" - session = requests.Session() - session.mount("http", _get_retry_adapter(10)) # tied to http and https - resp = session.get(get_url(path, **params), params=params) - resp.raise_for_status() - - # we have params meaning we requested a page (?pg=xxx) - # assumption: this must be a category page (so on same domain) - # we thus need to use redirection target (which lost param) with params - if params and resp.history: - return fetch(only_path_of(resp.url), **params) - return resp.text, [ - no_leading_slash(only_path_of(r.url)) for r in resp.history + [resp] - ] - - -def get_soup_of(text: str, unwrap: bool = False): - """an lxml soup of an HTML string""" - soup = bs4.BeautifulSoup(text, "lxml") - if unwrap: - for elem in ("body", "html"): - getattr(soup, elem).unwrap() - return soup - - -def get_soup(path: str, **params) -> bs4.BeautifulSoup: - """an lxml soup of a path on source website""" - content, paths = fetch(path, **params) - return get_soup_of(content), paths - - -def get_digest(url: str) -> str: - """simple digest of an url for mapping purpose""" - return str(zlib.adler32(url.encode("UTF-8"))) - - -def normalize_ident(ident: str) -> str: - """URL-decoded category identifier""" - return urllib.parse.unquote(ident) - - -def get_version_ident_for(url: str) -> str: - """~version~ of the URL data to use for comparisons. Built from headers""" - try: - resp = requests.head(url) - headers = resp.headers - except Exception as exc: - logger.warning(f"Unable to HEAD {url}") - logger.exception(exc) - try: - _, headers = stream_file( - url=url, - byte_stream=io.BytesIO(), - block_size=1, - only_first_block=True, - ) - except Exception as exc2: - logger.warning(f"Unable to query image at {url}") - logger.exception(exc2) - return - - for header in ("ETag", "Last-Modified", "Content-Length"): - if headers.get(header): - return headers.get(header) - - return "-1" - - -def setup_s3_and_check_credentials(s3_url_with_credentials): - logger.info("testing S3 Optimization Cache credentials") - s3_storage = KiwixStorage(s3_url_with_credentials) - if not s3_storage.check_credentials( - list_buckets=True, bucket=True, write=True, read=True, failsafe=True - ): - logger.error("S3 cache connection error testing permissions.") - logger.error(f" Server: {s3_storage.url.netloc}") - logger.error(f" Bucket: {s3_storage.bucket_name}") - logger.error(f" Key ID: {s3_storage.params.get('keyid')}") - logger.error(f" Public IP: {get_public_ip()}") - raise ValueError("Unable to connect to Optimization Cache. 
Check its URL.") - return s3_storage - - -def backoff_hdlr(details): - logger.warning( - "Backing off {wait:0.1f} seconds after {tries} tries " - "calling function {target} with args {args} and kwargs " - "{kwargs}".format(**details) - ) - - -@backoff.on_exception( - backoff.expo, - requests.exceptions.RequestException, - max_time=16, - on_backoff=backoff_hdlr, -) -def get_api_content(path, **params): - full_path = get_url(API_PREFIX + path, **params) - logger.debug(f"Retrieving {full_path}") - response = requests.get(full_path) - json_data = response.json() if response and response.status_code == 200 else None - return json_data diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e2dcff0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,235 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ifixit2zim" +authors = [ + { name = "Kiwix", email = "dev@kiwix.org" }, +] +keywords = ["kiwix","zim","offline","ifixit"] +requires-python = ">=3.12,<3.13" +description = "Make ZIM file from iFixit guides" +readme = "README.md" +license = {text = "GPL-3.0-or-later"} +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", +] +dependencies = [ + "requests==2.31.0", + "zimscraperlib==3.3.1", + "kiwixstorage==0.8.3", + "Jinja2==3.1.3", + "backoff==2.2.1", + "pif==0.8.2", + "schedule==1.2.1", +] +dynamic = ["version"] + +[project.optional-dependencies] +scripts = [ + "invoke==2.2.0", +] +lint = [ + "black==24.2.0", + "ruff==0.3.0", +] +check = [ + "pyright==1.1.352", +] +test = [ + "pytest==8.0.2", + "coverage==7.4.3", +] +dev = [ + "pre-commit==3.6.2", + "debugpy==1.8.1", + "ifixit2zim[scripts]", + "ifixit2zim[lint]", + "ifixit2zim[test]", + "ifixit2zim[check]", +] + +[project.urls] +Homepage = "https://github.com/openzim/ifixit2zim" +Donate = "https://www.kiwix.org/en/support-us/" + +[project.scripts] +ifixit2zim = "ifixit2zim.__main__:main" + +[tool.hatch.version] +path = "src/ifixit2zim/__about__.py" + +[tool.hatch.build] +exclude = [ + "/.github", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/ifixit2zim"] + +[tool.hatch.envs.default] +features = ["dev"] + +[tool.hatch.envs.test] +features = ["scripts", "test"] + +[tool.hatch.envs.test.scripts] +run = "inv test --args '{args}'" +run-cov = "inv test-cov --args '{args}'" +report-cov = "inv report-cov" +coverage = "inv coverage --args '{args}'" +html = "inv coverage --html --args '{args}'" + +[tool.hatch.envs.lint] +template = "lint" +skip-install = false +features = ["scripts", "lint"] + +[tool.hatch.envs.lint.scripts] +black = "inv lint-black --args '{args}'" +ruff = "inv lint-ruff --args '{args}'" +all = "inv lintall --args '{args}'" +fix-black = "inv fix-black --args '{args}'" +fix-ruff = "inv fix-ruff --args '{args}'" +fixall = "inv fixall --args '{args}'" + +[tool.hatch.envs.check] +features = ["scripts", "check"] + +[tool.hatch.envs.check.scripts] +pyright = "inv check-pyright --args '{args}'" +all = "inv checkall --args '{args}'" + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.ruff] +target-version = "py312" +line-length = 88 +src = ["src"] + +[tool.ruff.lint] +select = [ + "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # 
flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # flake8-no-pep420 + # "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Remove flake8-errmsg since we consider they bloat the code and provide limited value + "EM", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore warnings on subprocess.run / popen + "S603", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.lint.isort] +known-first-party = ["ifixit2zim"] + +[tool.ruff.lint.flake8-bugbear] +# add exceptions to B008 for fastapi. 
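+# (note: ifixit2zim itself does not depend on fastapi; the entries below are
+# assumed to be harmless defaults inherited from the project template)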
+extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.pytest.ini_options] +minversion = "7.3" +testpaths = ["tests"] +pythonpath = [".", "src"] + +[tool.coverage.paths] +great_project = ["src/ifixit2zim"] +tests = ["tests"] + +[tool.coverage.run] +source_pkgs = ["ifixit2zim"] +branch = true +parallel = true +omit = [ + "src/ifixit2zim/__about__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pyright] +include = ["src", "tests", "tasks.py"] +exclude = [".env/**", ".venv/**"] +extraPaths = ["src"] +pythonVersion = "3.12" +typeCheckingMode="basic" diff --git a/requirements.pip b/requirements.pip deleted file mode 100644 index 2513b11..0000000 --- a/requirements.pip +++ /dev/null @@ -1,7 +0,0 @@ -requests>=2.27.0,<2.28 -zimscraperlib>=1.6.0,<1.7 -kiwixstorage>=0.8.2,<0.9 -Jinja2>=3.1.2,<3.2 -backoff>=2.0.1,<2.1 -pif>=0.8.2,<0.9 -schedule>=1.1.0,<1.2 diff --git a/setup.py b/setup.py deleted file mode 100644 index 947a15f..0000000 --- a/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- - -import pathlib - -from setuptools import setup - -root_dir = pathlib.Path(__file__).parent - - -def read(*names, **kwargs): - with open(root_dir.joinpath(*names), "r") as fh: - return fh.read() - - -setup( - name="ifixit2zim", - version=read("ifixit2zim", "VERSION").strip(), - description="Make ZIM file from iFixit articles", - long_description=read("README.md"), - long_description_content_type="text/markdown", - author="Kiwix Team", - author_email="dev@kiwix.org", - url="https://kiwix.org/", - keywords="kiwix zim offline ifixit", - license="GPLv3+", - packages=["ifixit2zim"], - install_requires=[ - line.strip() - for line in read("requirements.pip").splitlines() - if not line.strip().startswith("#") and not line.startswith("https://") - ], - zip_safe=False, - include_package_data=True, - package_data={"": ["VERSION", "templates/*", "assets/*"]}, - entry_points={ - "console_scripts": [ - "ifixit2zim=ifixit2zim.__main__:main", - ] - }, - classifiers=[ - "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - ], - python_requires=">=3.8", -) diff --git a/src/ifixit2zim/__about__.py b/src/ifixit2zim/__about__.py new file mode 100644 index 0000000..c585b2e --- /dev/null +++ b/src/ifixit2zim/__about__.py @@ -0,0 +1 @@ +__version__ = "0.3.0-dev0" diff --git a/ifixit2zim/__init__.py b/src/ifixit2zim/__init__.py similarity index 100% rename from ifixit2zim/__init__.py rename to src/ifixit2zim/__init__.py diff --git a/src/ifixit2zim/__main__.py b/src/ifixit2zim/__main__.py new file mode 100644 index 0000000..e38e096 --- /dev/null +++ b/src/ifixit2zim/__main__.py @@ -0,0 +1,4 @@ +from ifixit2zim.entrypoint import main + +if __name__ == "__main__": + main() diff --git a/ifixit2zim/assets/16px_11px_5.png b/src/ifixit2zim/assets/16px_11px_5.png similarity index 100% rename from ifixit2zim/assets/16px_11px_5.png rename to src/ifixit2zim/assets/16px_11px_5.png diff --git a/ifixit2zim/assets/32px_32px_5.png b/src/ifixit2zim/assets/32px_32px_5.png similarity index 100% rename from ifixit2zim/assets/32px_32px_5.png rename to src/ifixit2zim/assets/32px_32px_5.png diff --git 
a/ifixit2zim/assets/FrameModules-translation_credit-qBLVoFL8fSDpJmDUuduPTA.css b/src/ifixit2zim/assets/FrameModules-translation_credit-qBLVoFL8fSDpJmDUuduPTA.css similarity index 100% rename from ifixit2zim/assets/FrameModules-translation_credit-qBLVoFL8fSDpJmDUuduPTA.css rename to src/ifixit2zim/assets/FrameModules-translation_credit-qBLVoFL8fSDpJmDUuduPTA.css diff --git a/ifixit2zim/assets/GuideNoImage_300x225.jpg b/src/ifixit2zim/assets/GuideNoImage_300x225.jpg similarity index 100% rename from ifixit2zim/assets/GuideNoImage_300x225.jpg rename to src/ifixit2zim/assets/GuideNoImage_300x225.jpg diff --git a/ifixit2zim/assets/NoImage_300x225.jpg b/src/ifixit2zim/assets/NoImage_300x225.jpg similarity index 100% rename from ifixit2zim/assets/NoImage_300x225.jpg rename to src/ifixit2zim/assets/NoImage_300x225.jpg diff --git a/ifixit2zim/assets/NoImage_56x42.jpg b/src/ifixit2zim/assets/NoImage_56x42.jpg similarity index 100% rename from ifixit2zim/assets/NoImage_56x42.jpg rename to src/ifixit2zim/assets/NoImage_56x42.jpg diff --git a/ifixit2zim/assets/NoImage_96x72.jpg b/src/ifixit2zim/assets/NoImage_96x72.jpg similarity index 100% rename from ifixit2zim/assets/NoImage_96x72.jpg rename to src/ifixit2zim/assets/NoImage_96x72.jpg diff --git a/ifixit2zim/assets/PIE.htc b/src/ifixit2zim/assets/PIE.htc similarity index 100% rename from ifixit2zim/assets/PIE.htc rename to src/ifixit2zim/assets/PIE.htc diff --git a/ifixit2zim/assets/S6u8w4BMUTPHjxsAUi-qNiXg7eU0.woff2 b/src/ifixit2zim/assets/S6u8w4BMUTPHjxsAUi-qNiXg7eU0.woff2 similarity index 100% rename from ifixit2zim/assets/S6u8w4BMUTPHjxsAUi-qNiXg7eU0.woff2 rename to src/ifixit2zim/assets/S6u8w4BMUTPHjxsAUi-qNiXg7eU0.woff2 diff --git a/ifixit2zim/assets/S6u8w4BMUTPHjxsAXC-qNiXg7Q.woff2 b/src/ifixit2zim/assets/S6u8w4BMUTPHjxsAXC-qNiXg7Q.woff2 similarity index 100% rename from ifixit2zim/assets/S6u8w4BMUTPHjxsAXC-qNiXg7Q.woff2 rename to src/ifixit2zim/assets/S6u8w4BMUTPHjxsAXC-qNiXg7Q.woff2 diff --git a/ifixit2zim/assets/S6u9w4BMUTPHh6UVSwaPGQ3q5d0N7w.woff2 b/src/ifixit2zim/assets/S6u9w4BMUTPHh6UVSwaPGQ3q5d0N7w.woff2 similarity index 100% rename from ifixit2zim/assets/S6u9w4BMUTPHh6UVSwaPGQ3q5d0N7w.woff2 rename to src/ifixit2zim/assets/S6u9w4BMUTPHh6UVSwaPGQ3q5d0N7w.woff2 diff --git a/ifixit2zim/assets/S6u9w4BMUTPHh6UVSwiPGQ3q5d0.woff2 b/src/ifixit2zim/assets/S6u9w4BMUTPHh6UVSwiPGQ3q5d0.woff2 similarity index 100% rename from ifixit2zim/assets/S6u9w4BMUTPHh6UVSwiPGQ3q5d0.woff2 rename to src/ifixit2zim/assets/S6u9w4BMUTPHh6UVSwiPGQ3q5d0.woff2 diff --git a/ifixit2zim/assets/S6uyw4BMUTPHjx4wXiWtFCc.woff2 b/src/ifixit2zim/assets/S6uyw4BMUTPHjx4wXiWtFCc.woff2 similarity index 100% rename from ifixit2zim/assets/S6uyw4BMUTPHjx4wXiWtFCc.woff2 rename to src/ifixit2zim/assets/S6uyw4BMUTPHjx4wXiWtFCc.woff2 diff --git a/ifixit2zim/assets/S6uyw4BMUTPHjxAwXiWtFCfQ7A.woff2 b/src/ifixit2zim/assets/S6uyw4BMUTPHjxAwXiWtFCfQ7A.woff2 similarity index 100% rename from ifixit2zim/assets/S6uyw4BMUTPHjxAwXiWtFCfQ7A.woff2 rename to src/ifixit2zim/assets/S6uyw4BMUTPHjxAwXiWtFCfQ7A.woff2 diff --git a/ifixit2zim/assets/Shared-attachment_link-AoWbgS-g65jo1DYOaHV5XA.css b/src/ifixit2zim/assets/Shared-attachment_link-AoWbgS-g65jo1DYOaHV5XA.css similarity index 100% rename from ifixit2zim/assets/Shared-attachment_link-AoWbgS-g65jo1DYOaHV5XA.css rename to src/ifixit2zim/assets/Shared-attachment_link-AoWbgS-g65jo1DYOaHV5XA.css diff --git a/ifixit2zim/assets/Shared-cart_banner-33Ctp6kCy0R-IiTsFeV6cw.css b/src/ifixit2zim/assets/Shared-cart_banner-33Ctp6kCy0R-IiTsFeV6cw.css similarity 
index 100% rename from ifixit2zim/assets/Shared-cart_banner-33Ctp6kCy0R-IiTsFeV6cw.css rename to src/ifixit2zim/assets/Shared-cart_banner-33Ctp6kCy0R-IiTsFeV6cw.css diff --git a/ifixit2zim/assets/Shared-i18n_formatting-7XRaMqur0Z-hJvP-W8sS2A.css b/src/ifixit2zim/assets/Shared-i18n_formatting-7XRaMqur0Z-hJvP-W8sS2A.css similarity index 100% rename from ifixit2zim/assets/Shared-i18n_formatting-7XRaMqur0Z-hJvP-W8sS2A.css rename to src/ifixit2zim/assets/Shared-i18n_formatting-7XRaMqur0Z-hJvP-W8sS2A.css diff --git a/ifixit2zim/assets/Shared-print-ej-m-RsicBzcpqbxdfzumQ.css b/src/ifixit2zim/assets/Shared-print-ej-m-RsicBzcpqbxdfzumQ.css similarity index 100% rename from ifixit2zim/assets/Shared-print-ej-m-RsicBzcpqbxdfzumQ.css rename to src/ifixit2zim/assets/Shared-print-ej-m-RsicBzcpqbxdfzumQ.css diff --git a/ifixit2zim/assets/Wiki-common-Zf5O-KLmFhhZ0w9cRZYZoQ.css b/src/ifixit2zim/assets/Wiki-common-Zf5O-KLmFhhZ0w9cRZYZoQ.css similarity index 100% rename from ifixit2zim/assets/Wiki-common-Zf5O-KLmFhhZ0w9cRZYZoQ.css rename to src/ifixit2zim/assets/Wiki-common-Zf5O-KLmFhhZ0w9cRZYZoQ.css diff --git a/ifixit2zim/assets/Wiki-topic-r_spN9srKqcGQAC8emdeTA.css b/src/ifixit2zim/assets/Wiki-topic-r_spN9srKqcGQAC8emdeTA.css similarity index 100% rename from ifixit2zim/assets/Wiki-topic-r_spN9srKqcGQAC8emdeTA.css rename to src/ifixit2zim/assets/Wiki-topic-r_spN9srKqcGQAC8emdeTA.css diff --git a/ifixit2zim/assets/area_index-BDTBciD-Y7NVVjoPQBUyhA.css b/src/ifixit2zim/assets/area_index-BDTBciD-Y7NVVjoPQBUyhA.css similarity index 100% rename from ifixit2zim/assets/area_index-BDTBciD-Y7NVVjoPQBUyhA.css rename to src/ifixit2zim/assets/area_index-BDTBciD-Y7NVVjoPQBUyhA.css diff --git a/ifixit2zim/assets/badge_icons_20110608.png b/src/ifixit2zim/assets/badge_icons_20110608.png similarity index 100% rename from ifixit2zim/assets/badge_icons_20110608.png rename to src/ifixit2zim/assets/badge_icons_20110608.png diff --git a/ifixit2zim/assets/camera-large-add-2.png b/src/ifixit2zim/assets/camera-large-add-2.png similarity index 100% rename from ifixit2zim/assets/camera-large-add-2.png rename to src/ifixit2zim/assets/camera-large-add-2.png diff --git a/ifixit2zim/assets/camera-small-add-2.png b/src/ifixit2zim/assets/camera-small-add-2.png similarity index 100% rename from ifixit2zim/assets/camera-small-add-2.png rename to src/ifixit2zim/assets/camera-small-add-2.png diff --git a/ifixit2zim/assets/check1x.png b/src/ifixit2zim/assets/check1x.png similarity index 100% rename from ifixit2zim/assets/check1x.png rename to src/ifixit2zim/assets/check1x.png diff --git a/ifixit2zim/assets/check2x.png b/src/ifixit2zim/assets/check2x.png similarity index 100% rename from ifixit2zim/assets/check2x.png rename to src/ifixit2zim/assets/check2x.png diff --git a/ifixit2zim/assets/checklarge1x.png b/src/ifixit2zim/assets/checklarge1x.png similarity index 100% rename from ifixit2zim/assets/checklarge1x.png rename to src/ifixit2zim/assets/checklarge1x.png diff --git a/ifixit2zim/assets/checklarge2x.png b/src/ifixit2zim/assets/checklarge2x.png similarity index 100% rename from ifixit2zim/assets/checklarge2x.png rename to src/ifixit2zim/assets/checklarge2x.png diff --git a/ifixit2zim/assets/checksmall1x.png b/src/ifixit2zim/assets/checksmall1x.png similarity index 100% rename from ifixit2zim/assets/checksmall1x.png rename to src/ifixit2zim/assets/checksmall1x.png diff --git a/ifixit2zim/assets/checksmall2x.png b/src/ifixit2zim/assets/checksmall2x.png similarity index 100% rename from ifixit2zim/assets/checksmall2x.png rename to 
src/ifixit2zim/assets/checksmall2x.png diff --git a/ifixit2zim/assets/core-ETWCjfemWxBbJHaoXafxFg.css b/src/ifixit2zim/assets/core-ETWCjfemWxBbJHaoXafxFg.css similarity index 100% rename from ifixit2zim/assets/core-ETWCjfemWxBbJHaoXafxFg.css rename to src/ifixit2zim/assets/core-ETWCjfemWxBbJHaoXafxFg.css diff --git a/ifixit2zim/assets/core-primitives-F5WnAWhrwpl7oCtqtgogQQ.css b/src/ifixit2zim/assets/core-primitives-F5WnAWhrwpl7oCtqtgogQQ.css similarity index 100% rename from ifixit2zim/assets/core-primitives-F5WnAWhrwpl7oCtqtgogQQ.css rename to src/ifixit2zim/assets/core-primitives-F5WnAWhrwpl7oCtqtgogQQ.css diff --git a/ifixit2zim/assets/css2.css b/src/ifixit2zim/assets/css2.css similarity index 100% rename from ifixit2zim/assets/css2.css rename to src/ifixit2zim/assets/css2.css diff --git a/ifixit2zim/assets/customZimHelpers-1.js b/src/ifixit2zim/assets/customZimHelpers-1.js similarity index 100% rename from ifixit2zim/assets/customZimHelpers-1.js rename to src/ifixit2zim/assets/customZimHelpers-1.js diff --git a/ifixit2zim/assets/document-add.png b/src/ifixit2zim/assets/document-add.png similarity index 100% rename from ifixit2zim/assets/document-add.png rename to src/ifixit2zim/assets/document-add.png diff --git a/ifixit2zim/assets/favicon.ico b/src/ifixit2zim/assets/favicon.ico similarity index 100% rename from ifixit2zim/assets/favicon.ico rename to src/ifixit2zim/assets/favicon.ico diff --git a/ifixit2zim/assets/font-awesome-HTdabjFBu1PkVsVncBZulw.css b/src/ifixit2zim/assets/font-awesome-HTdabjFBu1PkVsVncBZulw.css similarity index 100% rename from ifixit2zim/assets/font-awesome-HTdabjFBu1PkVsVncBZulw.css rename to src/ifixit2zim/assets/font-awesome-HTdabjFBu1PkVsVncBZulw.css diff --git a/ifixit2zim/assets/guide-all-evkgScrziDY3Uq33ElIkNA.css b/src/ifixit2zim/assets/guide-all-evkgScrziDY3Uq33ElIkNA.css similarity index 100% rename from ifixit2zim/assets/guide-all-evkgScrziDY3Uq33ElIkNA.css rename to src/ifixit2zim/assets/guide-all-evkgScrziDY3Uq33ElIkNA.css diff --git a/ifixit2zim/assets/helptaboverlay.png b/src/ifixit2zim/assets/helptaboverlay.png similarity index 100% rename from ifixit2zim/assets/helptaboverlay.png rename to src/ifixit2zim/assets/helptaboverlay.png diff --git a/ifixit2zim/assets/helptabshadow.png b/src/ifixit2zim/assets/helptabshadow.png similarity index 100% rename from ifixit2zim/assets/helptabshadow.png rename to src/ifixit2zim/assets/helptabshadow.png diff --git a/ifixit2zim/assets/home/AdQTqqV252aRMVdf.jpg b/src/ifixit2zim/assets/home/AdQTqqV252aRMVdf.jpg similarity index 100% rename from ifixit2zim/assets/home/AdQTqqV252aRMVdf.jpg rename to src/ifixit2zim/assets/home/AdQTqqV252aRMVdf.jpg diff --git a/ifixit2zim/assets/home/H4M1JEQiZabe4Vo2.jpg b/src/ifixit2zim/assets/home/H4M1JEQiZabe4Vo2.jpg similarity index 100% rename from ifixit2zim/assets/home/H4M1JEQiZabe4Vo2.jpg rename to src/ifixit2zim/assets/home/H4M1JEQiZabe4Vo2.jpg diff --git a/ifixit2zim/assets/home/IlacTC2EdoTKtKdC.jpg b/src/ifixit2zim/assets/home/IlacTC2EdoTKtKdC.jpg similarity index 100% rename from ifixit2zim/assets/home/IlacTC2EdoTKtKdC.jpg rename to src/ifixit2zim/assets/home/IlacTC2EdoTKtKdC.jpg diff --git a/ifixit2zim/assets/home/JxaijlWopNJnHo2S.jpg b/src/ifixit2zim/assets/home/JxaijlWopNJnHo2S.jpg similarity index 100% rename from ifixit2zim/assets/home/JxaijlWopNJnHo2S.jpg rename to src/ifixit2zim/assets/home/JxaijlWopNJnHo2S.jpg diff --git a/ifixit2zim/assets/home/P1WJqUOlaXZYNVSD.jpg b/src/ifixit2zim/assets/home/P1WJqUOlaXZYNVSD.jpg similarity index 100% rename from 
ifixit2zim/assets/home/P1WJqUOlaXZYNVSD.jpg rename to src/ifixit2zim/assets/home/P1WJqUOlaXZYNVSD.jpg diff --git a/ifixit2zim/assets/home/RacpAWWRuobmX5g4.jpg b/src/ifixit2zim/assets/home/RacpAWWRuobmX5g4.jpg similarity index 100% rename from ifixit2zim/assets/home/RacpAWWRuobmX5g4.jpg rename to src/ifixit2zim/assets/home/RacpAWWRuobmX5g4.jpg diff --git a/ifixit2zim/assets/home/firsttimerepairing_banner-2.jpg b/src/ifixit2zim/assets/home/firsttimerepairing_banner-2.jpg similarity index 100% rename from ifixit2zim/assets/home/firsttimerepairing_banner-2.jpg rename to src/ifixit2zim/assets/home/firsttimerepairing_banner-2.jpg diff --git a/ifixit2zim/assets/home/hKrLIluHRDXxUAit.jpg b/src/ifixit2zim/assets/home/hKrLIluHRDXxUAit.jpg similarity index 100% rename from ifixit2zim/assets/home/hKrLIluHRDXxUAit.jpg rename to src/ifixit2zim/assets/home/hKrLIluHRDXxUAit.jpg diff --git a/ifixit2zim/assets/home/macbook-pro.jpg b/src/ifixit2zim/assets/home/macbook-pro.jpg similarity index 100% rename from ifixit2zim/assets/home/macbook-pro.jpg rename to src/ifixit2zim/assets/home/macbook-pro.jpg diff --git a/ifixit2zim/assets/icomoon-gzipped_20210215.woff b/src/ifixit2zim/assets/icomoon-gzipped_20210215.woff similarity index 100% rename from ifixit2zim/assets/icomoon-gzipped_20210215.woff rename to src/ifixit2zim/assets/icomoon-gzipped_20210215.woff diff --git a/ifixit2zim/assets/icomoon_20160111.eot b/src/ifixit2zim/assets/icomoon_20160111.eot similarity index 100% rename from ifixit2zim/assets/icomoon_20160111.eot rename to src/ifixit2zim/assets/icomoon_20160111.eot diff --git a/ifixit2zim/assets/icomoon_20160111.svg b/src/ifixit2zim/assets/icomoon_20160111.svg similarity index 100% rename from ifixit2zim/assets/icomoon_20160111.svg rename to src/ifixit2zim/assets/icomoon_20160111.svg diff --git a/ifixit2zim/assets/icomoon_20160111.ttf b/src/ifixit2zim/assets/icomoon_20160111.ttf similarity index 100% rename from ifixit2zim/assets/icomoon_20160111.ttf rename to src/ifixit2zim/assets/icomoon_20160111.ttf diff --git a/ifixit2zim/assets/illustration.png b/src/ifixit2zim/assets/illustration.png similarity index 100% rename from ifixit2zim/assets/illustration.png rename to src/ifixit2zim/assets/illustration.png diff --git a/ifixit2zim/assets/kits-small.jpg b/src/ifixit2zim/assets/kits-small.jpg similarity index 100% rename from ifixit2zim/assets/kits-small.jpg rename to src/ifixit2zim/assets/kits-small.jpg diff --git a/ifixit2zim/assets/kits.jpg b/src/ifixit2zim/assets/kits.jpg similarity index 100% rename from ifixit2zim/assets/kits.jpg rename to src/ifixit2zim/assets/kits.jpg diff --git a/ifixit2zim/assets/loader.gif b/src/ifixit2zim/assets/loader.gif similarity index 100% rename from ifixit2zim/assets/loader.gif rename to src/ifixit2zim/assets/loader.gif diff --git a/ifixit2zim/assets/media-upload-types.png b/src/ifixit2zim/assets/media-upload-types.png similarity index 100% rename from ifixit2zim/assets/media-upload-types.png rename to src/ifixit2zim/assets/media-upload-types.png diff --git a/ifixit2zim/assets/module-all-a4ubLUywxaL0H1WJD5LLgQ.css b/src/ifixit2zim/assets/module-all-a4ubLUywxaL0H1WJD5LLgQ.css similarity index 100% rename from ifixit2zim/assets/module-all-a4ubLUywxaL0H1WJD5LLgQ.css rename to src/ifixit2zim/assets/module-all-a4ubLUywxaL0H1WJD5LLgQ.css diff --git a/ifixit2zim/assets/moto_g5_plus_4.jpg b/src/ifixit2zim/assets/moto_g5_plus_4.jpg similarity index 100% rename from ifixit2zim/assets/moto_g5_plus_4.jpg rename to src/ifixit2zim/assets/moto_g5_plus_4.jpg diff --git 
a/ifixit2zim/assets/new-guide-view-all-Zs-aI_CApaXZ_ssFDlTZ9g.css b/src/ifixit2zim/assets/new-guide-view-all-Zs-aI_CApaXZ_ssFDlTZ9g.css similarity index 100% rename from ifixit2zim/assets/new-guide-view-all-Zs-aI_CApaXZ_ssFDlTZ9g.css rename to src/ifixit2zim/assets/new-guide-view-all-Zs-aI_CApaXZ_ssFDlTZ9g.css diff --git a/ifixit2zim/assets/not_here.js b/src/ifixit2zim/assets/not_here.js similarity index 100% rename from ifixit2zim/assets/not_here.js rename to src/ifixit2zim/assets/not_here.js diff --git a/ifixit2zim/assets/prosemirror-all-_OBJ3KkZRD0uygPKzpMb8Q.css b/src/ifixit2zim/assets/prosemirror-all-_OBJ3KkZRD0uygPKzpMb8Q.css similarity index 100% rename from ifixit2zim/assets/prosemirror-all-_OBJ3KkZRD0uygPKzpMb8Q.css rename to src/ifixit2zim/assets/prosemirror-all-_OBJ3KkZRD0uygPKzpMb8Q.css diff --git a/ifixit2zim/assets/release-version-orbcTfqm6_JKsoz-PPnHGA.css b/src/ifixit2zim/assets/release-version-orbcTfqm6_JKsoz-PPnHGA.css similarity index 100% rename from ifixit2zim/assets/release-version-orbcTfqm6_JKsoz-PPnHGA.css rename to src/ifixit2zim/assets/release-version-orbcTfqm6_JKsoz-PPnHGA.css diff --git a/ifixit2zim/assets/repair-score-bad.svg b/src/ifixit2zim/assets/repair-score-bad.svg similarity index 100% rename from ifixit2zim/assets/repair-score-bad.svg rename to src/ifixit2zim/assets/repair-score-bad.svg diff --git a/ifixit2zim/assets/repair-score-good.svg b/src/ifixit2zim/assets/repair-score-good.svg similarity index 100% rename from ifixit2zim/assets/repair-score-good.svg rename to src/ifixit2zim/assets/repair-score-good.svg diff --git a/ifixit2zim/assets/repair-score-neutral.svg b/src/ifixit2zim/assets/repair-score-neutral.svg similarity index 100% rename from ifixit2zim/assets/repair-score-neutral.svg rename to src/ifixit2zim/assets/repair-score-neutral.svg diff --git a/ifixit2zim/assets/replace-large.png b/src/ifixit2zim/assets/replace-large.png similarity index 100% rename from ifixit2zim/assets/replace-large.png rename to src/ifixit2zim/assets/replace-large.png diff --git a/ifixit2zim/assets/replace-small.png b/src/ifixit2zim/assets/replace-small.png similarity index 100% rename from ifixit2zim/assets/replace-small.png rename to src/ifixit2zim/assets/replace-small.png diff --git a/ifixit2zim/assets/right-to-repair.jpg b/src/ifixit2zim/assets/right-to-repair.jpg similarity index 100% rename from ifixit2zim/assets/right-to-repair.jpg rename to src/ifixit2zim/assets/right-to-repair.jpg diff --git a/ifixit2zim/assets/spinner.gif b/src/ifixit2zim/assets/spinner.gif similarity index 100% rename from ifixit2zim/assets/spinner.gif rename to src/ifixit2zim/assets/spinner.gif diff --git a/ifixit2zim/assets/sprite_guide_edit3.png b/src/ifixit2zim/assets/sprite_guide_edit3.png similarity index 100% rename from ifixit2zim/assets/sprite_guide_edit3.png rename to src/ifixit2zim/assets/sprite_guide_edit3.png diff --git a/ifixit2zim/assets/tooltip_sprite.png b/src/ifixit2zim/assets/tooltip_sprite.png similarity index 100% rename from ifixit2zim/assets/tooltip_sprite.png rename to src/ifixit2zim/assets/tooltip_sprite.png diff --git a/ifixit2zim/assets/transparency-50pxa.png b/src/ifixit2zim/assets/transparency-50pxa.png similarity index 100% rename from ifixit2zim/assets/transparency-50pxa.png rename to src/ifixit2zim/assets/transparency-50pxa.png diff --git a/ifixit2zim/assets/video-large-add-2.png b/src/ifixit2zim/assets/video-large-add-2.png similarity index 100% rename from ifixit2zim/assets/video-large-add-2.png rename to src/ifixit2zim/assets/video-large-add-2.png diff --git 
a/ifixit2zim/assets/view-question-20091109.gif b/src/ifixit2zim/assets/view-question-20091109.gif similarity index 100% rename from ifixit2zim/assets/view-question-20091109.gif rename to src/ifixit2zim/assets/view-question-20091109.gif diff --git a/ifixit2zim/assets/view_profile-LAZ9O7S0EMQ9_BZEO-F8OQ.css b/src/ifixit2zim/assets/view_profile-LAZ9O7S0EMQ9_BZEO-F8OQ.css similarity index 100% rename from ifixit2zim/assets/view_profile-LAZ9O7S0EMQ9_BZEO-F8OQ.css rename to src/ifixit2zim/assets/view_profile-LAZ9O7S0EMQ9_BZEO-F8OQ.css diff --git a/ifixit2zim/constants.py b/src/ifixit2zim/constants.py similarity index 87% rename from ifixit2zim/constants.py rename to src/ifixit2zim/constants.py index 0f779d4..796b6c5 100644 --- a/ifixit2zim/constants.py +++ b/src/ifixit2zim/constants.py @@ -1,23 +1,18 @@ -# -*- coding: utf-8 -*- - import pathlib import tempfile import urllib.parse -from dataclasses import dataclass, field -from typing import List, Optional, Set from zimscraperlib.i18n import get_language_details +from ifixit2zim.__about__ import __version__ + ROOT_DIR = pathlib.Path(__file__).parent NAME = ROOT_DIR.name DEFAULT_HOMEPAGE = "Main-Page" UNKNOWN_LOCALE = "unknown" UNKNOWN_TITLE = "unknown_title" -with open(ROOT_DIR.joinpath("VERSION"), "r") as fh: - VERSION = fh.read().strip() - -SCRAPER = f"{NAME} {VERSION}" +SCRAPER = f"{NAME} {__version__}" IMAGES_ENCODER_VERSION = 1 URLS = { @@ -148,6 +143,24 @@ # https://www.ifixit.com/Guide/MacBook+Air+11-Inch+Late+2010+Battery+Replacement/4384 # https://www.ifixit.com/Teardown/Apple+Watch+Teardown/40655 +TITLE = { + "en": { + "title_en": "iFixit in English", + "title_fr": "iFixit in French", + "title_pt": "iFixit in Portuguese", + "title_de": "iFixit in German", + "title_ko": "iFixit in Korean", + "title_zh": "iFixit in Chinese", + "title_ru": "iFixit in Russian", + "title_nl": "iFixit in Dutch", + "title_ja": "iFixit in Japanese", + "title_tr": "iFixit in Turkish", + "title_es": "iFixit in Spanish", + "title_it": "iFixit in Italian", + }, + "fr": {"title_fr": "iFixit en Français"}, +} + HOME_LABELS = { "en": {"top_title": "Repair guides for every thing, written by everyone."}, "fr": {"top_title": "Tutoriels de réparation pour tout, écrits par tous."}, @@ -158,7 +171,7 @@ "ru": {"top_title": "Руководства по ремонту всего, от всех."}, "nl": {"top_title": "Reparatiehandleidingen voor alles, door iedereen."}, "ja": {"top_title": "修理を愛する人たちが作った、あらゆるモノへの修理ガイド"}, - "tr": {"top_title": "Herkes tarafından, her şey için yazılmış tamir kılavuzları."}, + "tr": {"top_title": "Herkes tarafindan, her şey için yazilmiş tamir kilavuzlari."}, "es": {"top_title": "Guías de reparación para todo, escritas por todos."}, "it": {"top_title": "Guide di riparazione per ogni cosa, scritte da tutti."}, } @@ -254,7 +267,7 @@ "disassembly_guides": "분해 안내서", "tools": "도구", "parts": "부품", - "tools_introduction": ("해당 기기를 고치는데 사용하는 일반 도구들 입니다. 매 단계에 모든 도구를 사용하지는 않습니다."), + "tools_introduction": "해당 기기를 고치는데 사용하는 일반 도구들 입니다. 
매 단계에 모든 도구를 사용하지는 않습니다.", # noqa E501 }, "zh": { "author": "作者: ", @@ -266,11 +279,11 @@ "in_progress_guides": "正在编写中的指南", "repairability": "可修复性:", "replacement_guides": "更换指南", - "teardown_guides": "拆​解", + "teardown_guides": "拆\u200b解", "disassembly_guides": "拆卸指南", "tools": "工具", "parts": "配件", - "tools_introduction": ("这是用于在这个设备上工作的一些常用工具。你可能不需要在每个过程中使用到每个工具。"), + "tools_introduction": "这是用于在这个设备上工作的一些常用工具。你可能不需要在每个过程中使用到每个工具。", # noqa E501 }, "ru": { "author": "Автор: ", @@ -326,9 +339,7 @@ "disassembly_guides": "分解ガイド", "tools": "ツール", "parts": "パーツ", - "tools_introduction": ( - "以前、このデバイスの修理に使われていた一般的な工具です。修理過程において全部の工具が必要" "とは限りません。" - ), + "tools_introduction": "以前、このデバイスの修理に使われていた一般的な工具です。修理過程において全部の工具が必要とは限りません。", # noqa E501 }, "tr": { "author": "Yazar: ", @@ -337,16 +348,16 @@ "featured_guides": "Featured Guides", # not present for now on website ... "technique_guides": "Teknikler", "related_pages": "İlgili Sayfalar", - "in_progress_guides": "Yapım Aşamasındaki Kılavuzlar", - "repairability": "Onarılabilirlik:", - "replacement_guides": "Parça Değişim Kılavuzları", + "in_progress_guides": "Yapim Aşamasindaki Kilavuzlar", + "repairability": "Onarilabilirlik:", + "replacement_guides": "Parça Değişim Kilavuzlari", "teardown_guides": "Teardown'lar", - "disassembly_guides": "Söküm Kılavuzları", + "disassembly_guides": "Söküm Kilavuzlari", "tools": "Aletler", "parts": "Parçalar", "tools_introduction": ( - "Bunlar, bu cihaz için yaygınca kullanılan bazı aletler. Her işlem için " - "her alete ihtiyacınız yoktur." + "Bunlar, bu cihaz için yayginca kullanilan bazi aletler. Her işlem için " + "her alete ihtiyaciniz yoktur." ), }, "it": { @@ -518,7 +529,7 @@ "parts": "부품", }, "zh": { - "written_by": "撰写者:", + "written_by": "撰写者:", "difficulty": "难度", "steps": "步骤", "time_required": "所需时间", @@ -532,7 +543,7 @@ "reputation": "信誉积分", "member_since_before": "", "member_since_after": "日注册", - "published": "发布于:", + "published": "发布于:", "teardown": " 拆解", "comments_count_before": "", "comments_count_after": "条评论", @@ -555,7 +566,7 @@ "conclusion": "Заключение", "author": "Автор", "reputation": "Репутация", - "member_since_before": "Участник с: ", + "member_since_before": "Участник c: ", "member_since_after": "", "published": "Опубликовано: ", "teardown": "Разбираем", @@ -593,7 +604,7 @@ "parts": "Onderdelen", }, "ja": { - "written_by": "作成者:", + "written_by": "作成者:", "difficulty": "難易度", "steps": "手順", "time_required": "所要時間", @@ -620,19 +631,19 @@ "tr": { "written_by": "Yazan:", "difficulty": "Zorluk", - "steps": " Adımlar", + "steps": " Adimlar", "time_required": "Gerekli Süre", "sections": "Bölümler", "flags": "İşaretler", "introduction": "Giriş", - "step_no_before": "Adım ", + "step_no_before": "Adim ", "step_no_after": "", "conclusion": "Sonuç", "author": "Yazar", "reputation": "İtibar", "member_since_before": "Üyelik tarihi: ", "member_since_after": "", - "published": "Yayımlama: ", + "published": "Yayimlama: ", "teardown": "Teardown", "comments_count_before": "", "comments_count_after": " yorum", @@ -728,7 +739,7 @@ }, "ru": { "reputation": "Репутация", - "member_since_before": "Пользователь с ", + "member_since_before": "Пользователь c ", "member_since_after": "", }, "nl": { @@ -806,97 +817,85 @@ UNAVAILABLE_OFFLINE_INFOS = ["toolkits"] -@dataclass -class Conf: - required = [ - "lang_code", - "output_dir", - ] - - lang_code: str = "" - language: dict = field(default_factory=dict) - main_url: str = "" +class Configuration: + fpath: pathlib.Path # zim params - name: str = "" - title: 
Optional[str] = "" - description: Optional[str] = "" - author: Optional[str] = "" - publisher: Optional[str] = "" - fname: Optional[str] = "" - tag: List[str] = field(default_factory=list) - - # customization - icon: Optional[str] = "" - categories: Set[str] = field(default_factory=set) - no_category: Optional[bool] = False - guides: Set[str] = field(default_factory=set) - no_guide: Optional[bool] = False - infos: Set[str] = field(default_factory=set) - no_info: Optional[bool] = False - users: Set[str] = field(default_factory=set) - no_user: Optional[bool] = False - no_cleanup: Optional[bool] = False + name: str + title: str + description: str + long_description: str | None + author: str + publisher: str + fname: str + tag: list[str] # filesystem - _output_dir: Optional[str] = "." - _tmp_dir: Optional[str] = "." - output_dir: Optional[pathlib.Path] = None - tmp_dir: Optional[pathlib.Path] = None + _output_name: str + _tmp_name: str + output_path: pathlib.Path + tmp_path: pathlib.Path - # performances - nb_threads: Optional[int] = -1 - s3_url_with_credentials: Optional[str] = "" - - # error handling - max_missing_items_percent: Optional[int] = 0 - max_error_items_percent: Optional[int] = 0 + required = ( + "lang_code", + "output_path", + ) - # debug/devel - build_dir_is_tmp_dir: Optional[bool] = False - keep_build_dir: Optional[bool] = False - scrape_only_first_items: Optional[bool] = False - debug: Optional[bool] = False - delay: Optional[float] = 0 - api_delay: Optional[float] = 0 - cdn_delay: Optional[float] = 0 - stats_filename: Optional[str] = None - skip_checks: Optional[bool] = False + lang_code: str + language: dict + main_url: urllib.parse.ParseResult - @staticmethod - def get_url(lang_code: str) -> urllib.parse.ParseResult: - return urllib.parse.urlparse(URLS[lang_code]) - - @property - def domain(self) -> str: - return self.main_url.netloc + # customization + icon: str + categories: set[str] + no_category: bool + guides: set[str] + no_guide: bool + infos: set[str] + no_info: bool + users: set[str] + no_user: bool + no_cleanup: bool - @property - def api_url(self) -> str: - return self.main_url + API_PREFIX + # performances + s3_url_with_credentials: str | None - @property - def s3_url(self) -> str: - return self.s3_url_with_credentials + # error handling + max_missing_items_percent: int + max_error_items_percent: int - def __post_init__(self): - self.main_url = Conf.get_url(self.lang_code) + # debug/devel + build_dir_is_tmp_dir: bool + keep_build_dir: bool + scrape_only_first_items: bool + debug: bool + delay: float + api_delay: float + cdn_delay: float + stats_filename: str | None + skip_checks: bool + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + self.__setattr__(key, value) + self.main_url = Configuration.get_url(self.lang_code) self.language = get_language_details(self.lang_code) - self.output_dir = pathlib.Path(self._output_dir).expanduser().resolve() - self.output_dir.mkdir(parents=True, exist_ok=True) + self.output_path = pathlib.Path(self._output_name).expanduser().resolve() + self.output_path.mkdir(parents=True, exist_ok=True) - self.tmp_dir = pathlib.Path(self._tmp_dir).expanduser().resolve() - self.tmp_dir.mkdir(parents=True, exist_ok=True) + self.tmp_path = pathlib.Path(self._tmp_name).expanduser().resolve() + self.tmp_path.mkdir(parents=True, exist_ok=True) if self.build_dir_is_tmp_dir: - self.build_dir = self.tmp_dir + self.build_path = self.tmp_path else: - self.build_dir = pathlib.Path( - tempfile.mkdtemp(prefix=f"ifixit_{self.lang_code}_", 
dir=self.tmp_dir) + self.build_path = pathlib.Path( + tempfile.mkdtemp(prefix=f"ifixit_{self.lang_code}_", dir=self.tmp_path) ) + self.stats_path = None if self.stats_filename: - self.stats_filename = pathlib.Path(self.stats_filename).expanduser() - self.stats_filename.parent.mkdir(parents=True, exist_ok=True) + self.stats_path = pathlib.Path(self.stats_filename).expanduser() + self.stats_path.parent.mkdir(parents=True, exist_ok=True) # support semi-colon separated tags as well if self.tag: @@ -904,3 +903,19 @@ def __post_init__(self): if ";" in tag: self.tag += [p.strip() for p in tag.split(";")] self.tag.remove(tag) + + @staticmethod + def get_url(lang_code: str) -> urllib.parse.ParseResult: + return urllib.parse.urlparse(URLS[lang_code]) + + @property + def domain(self) -> str: + return self.main_url.netloc + + @property + def api_url(self) -> str: + return self.main_url.geturl() + API_PREFIX + + @property + def s3_url(self) -> str | None: + return self.s3_url_with_credentials diff --git a/src/ifixit2zim/context.py b/src/ifixit2zim/context.py new file mode 100644 index 0000000..f502258 --- /dev/null +++ b/src/ifixit2zim/context.py @@ -0,0 +1,21 @@ +import threading +from dataclasses import dataclass +from typing import Any + +from jinja2 import Environment +from zimscraperlib.zim.creator import Creator + +from ifixit2zim.processor import Processor +from ifixit2zim.scraper import Configuration +from ifixit2zim.utils import Utils + + +@dataclass +class Context: + lock: threading.Lock + configuration: Configuration + creator: Creator + utils: Utils + metadata: dict[str, Any] + env: Environment + processor: Processor diff --git a/ifixit2zim/entrypoint.py b/src/ifixit2zim/entrypoint.py similarity index 87% rename from ifixit2zim/entrypoint.py rename to src/ifixit2zim/entrypoint.py index 85063a4..ab2fd4b 100755 --- a/ifixit2zim/entrypoint.py +++ b/src/ifixit2zim/entrypoint.py @@ -4,8 +4,8 @@ import os import sys -from .constants import NAME, SCRAPER, URLS -from .shared import Global, logger +from ifixit2zim.constants import NAME, SCRAPER, URLS +from ifixit2zim.shared import logger, set_debug def main(): @@ -26,7 +26,7 @@ def main(): "--output", help="Output folder for ZIM file", default="/output", - dest="_output_dir", + dest="_output_name", ) parser.add_argument( @@ -37,29 +37,37 @@ def main(): parser.add_argument( "--title", - help="Custom title for your ZIM. iFixit homepage title otherwise", + help="Custom title for your ZIM (30 chars max).", ) parser.add_argument( "--description", - help="Custom description for your ZIM. " - "iFixit homepage description (meta) otherwise", + help="Custom description for your ZIM (80 chars max). " + "Based on iFixit homepage description (meta) otherwise", + ) + + parser.add_argument( + "--long-description", + help="Custom long description for your ZIM (4000 chars max). " + "Based on iFixit homepage description (meta) otherwise", ) parser.add_argument( "--icon", - help="Custom icon for your ZIM (path or URL). " "iFixit square logo otherwise", + help="Custom icon for your ZIM (path or URL). iFixit square logo otherwise", ) parser.add_argument( "--creator", help="Name of content creator. “iFixit” otherwise", dest="author", + default="iFixit", ) parser.add_argument( "--publisher", help="Custom publisher name (ZIM metadata). 
“openZIM” otherwise", + default="openZIM", ) parser.add_argument( @@ -87,6 +95,7 @@ def main(): "--debug", help="Enable verbose output", action="store_true", + dest="debug", default=False, ) @@ -94,7 +103,7 @@ def main(): "--tmp-dir", help="Path to create temp folder in. Used for building ZIM file.", default=os.getenv("TMPDIR", "."), - dest="_tmp_dir", + dest="_tmp_name", ) parser.add_argument( @@ -257,18 +266,18 @@ def main(): ) args = parser.parse_args() - Global.set_debug(args.debug) + set_debug(args.debug) - from .scraper import ifixit2zim + from ifixit2zim.scraper import IFixit2Zim try: - scraper = ifixit2zim(**dict(args._get_kwargs())) + scraper = IFixit2Zim(**dict(args._get_kwargs())) sys.exit(scraper.run()) except Exception as exc: - logger.error(f"FAILED. An error occurred: {exc}") + logger.error("FAILED. An error occurred", exc_info=exc) if args.debug: logger.exception(exc) - raise SystemExit(1) + raise SystemExit(1) from None if __name__ == "__main__": diff --git a/src/ifixit2zim/exceptions.py b/src/ifixit2zim/exceptions.py new file mode 100644 index 0000000..ed0c562 --- /dev/null +++ b/src/ifixit2zim/exceptions.py @@ -0,0 +1,14 @@ +class FinalScrapingFailureError(Exception): + pass + + +class UnexpectedDataKindExceptionError(Exception): + pass + + +class CategoryHomePageContentError(Exception): + pass + + +class ImageUrlNotFoundError(Exception): + pass diff --git a/ifixit2zim/executor.py b/src/ifixit2zim/executor.py similarity index 96% rename from ifixit2zim/executor.py rename to src/ifixit2zim/executor.py index 9bc7ce1..ff83214 100644 --- a/ifixit2zim/executor.py +++ b/src/ifixit2zim/executor.py @@ -1,12 +1,11 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu import queue import threading -from typing import Callable +from collections.abc import Callable -from .shared import logger +from ifixit2zim.shared import logger _shutdown = False # Lock that ensures that new workers are not created while the interpreter is @@ -58,7 +57,7 @@ def submit(self, task: Callable, **kwargs): if not self.alive: raise RuntimeError("cannot submit task to dead executor") if _shutdown: - raise RuntimeError("cannot submit task after " "interpreter shutdown") + raise RuntimeError("cannot submit task after interpreter shutdown") while True: try: @@ -141,7 +140,7 @@ def release_halt(self): """release the `no_more` flag preventing workers from taking up tasks""" self.no_more = False - def shutdown(self, wait=True): + def shutdown(self, *, wait=True): """stop the executor, either somewhat immediately or awaiting completion""" logger.debug(f"shutting down executor {self.prefix} with {wait=}") with self._shutdown_lock: diff --git a/ifixit2zim/imager.py b/src/ifixit2zim/imager.py similarity index 78% rename from ifixit2zim/imager.py rename to src/ifixit2zim/imager.py index 35529b8..e5c5410 100644 --- a/ifixit2zim/imager.py +++ b/src/ifixit2zim/imager.py @@ -1,33 +1,46 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu import hashlib import io import pathlib import re +import threading import urllib.parse from kiwixstorage import KiwixStorage, NotFoundError from PIL import Image from zimscraperlib.download import stream_file from zimscraperlib.image.optimization import optimize_webp +from zimscraperlib.zim.creator import Creator -from .constants import IMAGES_ENCODER_VERSION -from .shared import Global -from .utils import get_version_ident_for, to_url - -logger = Global.logger +from ifixit2zim.constants import IMAGES_ENCODER_VERSION +from 
ifixit2zim.executor import Executor +from ifixit2zim.scraper import Configuration +from ifixit2zim.shared import logger +from ifixit2zim.utils import Utils class Imager: - def __init__(self): + def __init__( + self, + img_executor: Executor, + lock: threading.Lock, + creator: Creator, + utils: Utils, + configuration: Configuration, + ): self.aborted = False # list of source URLs that we've processed and added to ZIM self.handled = set() - self.dedup_items = dict() + self.dedup_items = {} + self.img_executor = img_executor + self.lock = lock + self.creator = creator + self.utils = utils + self.configuration = configuration - Global.img_executor.start() + self.img_executor.start() def abort(self): """request imager to cancel processing of futures""" @@ -53,7 +66,7 @@ def get_image_data(self, url: str) -> io.BytesIO: lossless=False, quality=60, method=6, - ) + ) # pyright: ignore[reportReturnType] def get_path_for(self, url: urllib.parse.ParseResult) -> str: url_with_only_path = urllib.parse.ParseResult( @@ -67,21 +80,23 @@ def get_path_for(self, url: urllib.parse.ParseResult) -> str: unquoted_url = urllib.parse.unquote(url_with_only_path.geturl()) return "images/{}".format(re.sub(r"^(https?)://", r"\1/", unquoted_url)) - def defer(self, url: str) -> str: + def defer(self, url: str) -> str | None: """request full processing of url, returning in-zim path immediately""" # find actual URL should it be from a provider try: - url = urllib.parse.urlparse(to_url(url)) + parsed_url = urllib.parse.urlparse(self.utils.to_url(url)) except Exception: logger.warning(f"Can't parse image URL `{url}`. Skipping") return - if url.scheme not in ("http", "https"): - logger.warning(f"Not supporting image URL `{url.geturl()}`. Skipping") + if parsed_url.scheme not in ("http", "https"): + logger.warning( + f"Not supporting image URL `{parsed_url.geturl()}`. Skipping" + ) return - path = self.get_path_for(url) + path = self.get_path_for(parsed_url) if path in self.handled: return path @@ -89,9 +104,9 @@ def defer(self, url: str) -> str: # record that we are processing this one self.handled.add(path) - Global.img_executor.submit( + self.img_executor.submit( self.process_image, - url=url, + url=parsed_url, path=path, mimetype="image/svg+xml" if path.endswith(".svg") else "image/webp", dont_release=True, @@ -108,34 +123,36 @@ def check_for_duplicate(self, path, content): def add_image_to_zim(self, path, content, mimetype): duplicate_path = self.check_for_duplicate(path, content) - with Global.lock: + with self.lock: if duplicate_path: - Global.creator.add_redirect( + self.creator.add_redirect( path=path, target_path=duplicate_path, ) else: - Global.creator.add_item_for( + self.creator.add_item_for( path=path, content=content, mimetype=mimetype, ) def add_missing_image_to_zim(self, path): - with Global.lock: - Global.creator.add_redirect( + with self.lock: + self.creator.add_redirect( path=path, target_path="assets/NoImage_300x225.jpg", ) - def process_image(self, url: str, path: str, mimetype: str) -> str: + def process_image( + self, url: urllib.parse.ParseResult, path: str, mimetype: str + ) -> str | None: """download image from url or S3 and add to Zim at path. 
Upload if req.""" if self.aborted: return # just download, optimize and add to ZIM if not using S3 - if not Global.conf.s3_url: + if not self.configuration.s3_url: try: fileobj = self.get_image_data(url.geturl()) except Exception as exc: @@ -157,7 +174,7 @@ def process_image(self, url: str, path: str, mimetype: str) -> str: return path # we are using S3 cache - ident = get_version_ident_for(url.geturl()) + ident = self.utils.get_version_ident_for(url.geturl()) if ident is None: logger.error(f"Unable to query {url.geturl()}. Skipping") self.add_missing_image_to_zim( @@ -166,7 +183,7 @@ def process_image(self, url: str, path: str, mimetype: str) -> str: return path # key = self.get_s3_key_for(url.geturl()) - s3_storage = KiwixStorage(Global.conf.s3_url) + s3_storage = KiwixStorage(self.configuration.s3_url) meta = {"ident": ident, "encoder_version": str(IMAGES_ENCODER_VERSION)} download_failed = False # useful to trigger reupload or not diff --git a/src/ifixit2zim/processor.py b/src/ifixit2zim/processor.py new file mode 100644 index 0000000..815581e --- /dev/null +++ b/src/ifixit2zim/processor.py @@ -0,0 +1,412 @@ +import datetime +import re +import threading +import urllib.parse + +import requests +from zimscraperlib.zim.creator import Creator + +from ifixit2zim.constants import ( + DEFAULT_DEVICE_IMAGE_URL, + DEFAULT_GUIDE_IMAGE_URL, + DEFAULT_USER_IMAGE_URLS, + DEFAULT_WIKI_IMAGE_URL, + NOT_YET_AVAILABLE, + UNAVAILABLE_OFFLINE, +) +from ifixit2zim.exceptions import ImageUrlNotFoundError +from ifixit2zim.imager import Imager +from ifixit2zim.scraper import Configuration +from ifixit2zim.shared import logger, setlocale + + +class Processor: + def __init__( + self, + lock: threading.Lock, + configuration: Configuration, + creator: Creator, + imager: Imager, + ) -> None: + self.null_categories = set() + self.ifixit_external_content = set() + self.final_hrefs = {} + self.lock = lock + self.configuration = configuration + self.creator = creator + self.imager = imager + + @property + def get_guide_link_from_props(self): + return self._get_guide_link_from_props + + @get_guide_link_from_props.setter + def get_guide_link_from_props(self, get_guide_link_from_props): + self._get_guide_link_from_props = get_guide_link_from_props + + @property + def get_category_link_from_props(self): + return self._get_category_link_from_props + + @get_category_link_from_props.setter + def get_category_link_from_props(self, get_category_link_from_props): + self._get_category_link_from_props = get_category_link_from_props + + @property + def get_info_link_from_props(self): + return self._get_info_link_from_props + + @get_info_link_from_props.setter + def get_info_link_from_props(self, get_info_link_from_props): + self._get_info_link_from_props = get_info_link_from_props + + @property + def get_user_link_from_props(self): + return self._get_user_link_from_props + + @get_user_link_from_props.setter + def get_user_link_from_props(self, get_user_link_from_props): + self._get_user_link_from_props = get_user_link_from_props + + # no-qa flag is mandatory because this is used in a Jinja filter and arg names are + # never passed by Jinja + def guides_in_progress(self, guides, in_progress=True): # noqa: FBT002 + if in_progress: + return [guide for guide in guides if "GUIDE_IN_PROGRESS" in guide["flags"]] + return [guide for guide in guides if "GUIDE_IN_PROGRESS" not in guide["flags"]] + + def category_count_parts(self, category): + if "parts" not in category: + return 0 + if "total" not in category["parts"]: + return 0 + return 
category["parts"]["total"] + + def category_count_tools(self, category): + if "tools" not in category: + return 0 + return len(category["tools"]) + + def get_image_path(self, image_url): + return self.imager.defer(url=image_url) + + def _get_image_url_search( + self, obj, *, for_guide: bool, for_device: bool, for_wiki: bool, for_user: bool + ) -> str: + if "standard" in obj: + return obj["standard"] + if "medium" in obj: + return obj["medium"] + if "large" in obj: + return obj["large"] + if "original" in obj: + return obj["original"] + if for_guide: + return DEFAULT_GUIDE_IMAGE_URL + if for_device: + return DEFAULT_DEVICE_IMAGE_URL + if for_wiki: + return DEFAULT_WIKI_IMAGE_URL + if for_user and "userid" in obj: + idx = obj["userid"] % len(DEFAULT_USER_IMAGE_URLS) + return DEFAULT_USER_IMAGE_URLS[idx] + raise ImageUrlNotFoundError(f"Unable to find image URL in object {obj}") + + def get_image_url( + self, obj, *, for_guide=False, for_device=False, for_wiki=False, for_user=False + ) -> str: + if obj.get("image"): + return self._get_image_url_search( + obj["image"], + for_guide=for_guide, + for_device=for_device, + for_wiki=for_wiki, + for_user=for_user, + ) + return self._get_image_url_search( + obj, + for_guide=for_guide, + for_device=for_device, + for_wiki=for_wiki, + for_user=for_user, + ) + + guide_regex_full = re.compile( + r"href=\"https://\w*\.ifixit\.\w*/Guide/.*/(?P\d*)\"" + ) + guide_regex_rel = re.compile(r"href=\"/Guide/.*/(?P\d*).*?\"") + + gbl_image_regex = r".*?)src\s*=\s*\"(?P.*?)\"" + gbl_href_regex = r"href\s*=\s*\"(?P.*?)\"" + gbl_youtube_regex = ( + r"(?!.*.+?)src=[\\\"']+(?P.+?)\"(?P.+?)" + ) + gbl_bgd_image_regex = ( + r"background-image:url\((?P"|\"|')" + r"(?P.*?)(?P"|\"|')\)" + ) + gbl_video_regex = r".*)" + gbl_iframe_regex = r".*?)\".*?" 
+ gbl_regex = re.compile( + f"{gbl_image_regex}|{gbl_href_regex}|{gbl_youtube_regex}|{gbl_bgd_image_regex}" + f"|{gbl_video_regex}|{gbl_iframe_regex}" + ) + + href_anchor_regex = r"^(?P<anchor>#.*)$" + href_object_kind_regex = ( + r"^(?:https*://[\w\.]*(?:ifixit)[\w\.]*)*/" + r"((?:(?P<kind>" + + "|".join(NOT_YET_AVAILABLE + UNAVAILABLE_OFFLINE) + + r")(?:/.+)?)" + r"|(?:(?P<guide>Guide|Anleitung|Guía|Guida|Tutoriel|Teardown)/" + r"(?P<guidetitle>.+)/(?P<guideid>\d+)(?P<guideafter>#.*)?.*)" + r"|(?:(?P<device>Device|Topic)/(?P<devicetitle>[\w%_\.-]+)" + r"(?P<deviceafter>#.*)?.*)" + r"|(?P<user>User)/(?P<userid>\d*)/(?P<usertitle>[\w%_\.+'-]+)" + r"(?P<userafter>#.*)?.*" + r"|(?:(?P<info>Info)/(?P<infotitle>[\w%_\.-]+)(?P<infoafter>#.*)?.*))$" + ) + href_regex = re.compile( + f"{href_anchor_regex}|{href_object_kind_regex}", flags=re.IGNORECASE + ) + + def _process_external_url(self, url, rel_prefix): + if "ifixit" in url: + self.ifixit_external_content.add(url) + return f"{rel_prefix}home/external_content?url={urllib.parse.quote(url)}" + + def _process_unrecognized_href(self, url, rel_prefix): + return self._process_external_url(url, rel_prefix) + + def _process_href_regex_dynamics(self, href, rel_prefix): + if "Guide/login/register" in href or "Guide/new" in href: + return ( + f"{rel_prefix}home/unavailable_offline" + f"?url={urllib.parse.quote(href)}" + ) + return None + + def _process_href_regex_nomatch(self, href, rel_prefix, match): + if match: + return None + return self._process_unrecognized_href(href, rel_prefix) + + def _process_href_regex_anchor(self, match): + if not match.group("anchor"): + return None + return f"{match.group('anchor')}" + + def _process_href_regex_guide(self, rel_prefix, match): + if not match.group("guide"): + return None + link = self.get_guide_link_from_props( + guideid=match.group("guideid"), + guidetitle=urllib.parse.unquote_plus(match.group("guidetitle")), + ) + return f"{rel_prefix}{link}{match.group('guideafter') or ''}" + + def _process_href_regex_device(self, rel_prefix, match): + if not match.group("device"): + return None + link = self.get_category_link_from_props( + category_title=urllib.parse.unquote_plus(match.group("devicetitle")) + ) + return f"{rel_prefix}{link}{match.group('deviceafter') or ''}" + + def _process_href_regex_info(self, rel_prefix, match): + if not match.group("info"): + return None + link = self.get_info_link_from_props( + info_title=urllib.parse.unquote_plus(match.group("infotitle")) + ) + return f"{rel_prefix}{link}{match.group('infoafter') or ''}" + + def _process_href_regex_user(self, rel_prefix, match): + if not match.group("user"): + return None + link = self.get_user_link_from_props( + userid=match.group("userid"), + usertitle=urllib.parse.unquote_plus(match.group("usertitle")), + ) + return f"{rel_prefix}{link}{match.group('userafter') or ''}" + + def _process_href_regex_kind(self, href, rel_prefix, match): + if not match.group("kind"): + return None + if match.group("kind").lower() in NOT_YET_AVAILABLE: + return f"{rel_prefix}home/not_yet_available?url={urllib.parse.quote(href)}" + if match.group("kind").lower() in UNAVAILABLE_OFFLINE: + return ( + f"{rel_prefix}home/unavailable_offline" + f"?url={urllib.parse.quote(href)}" + ) + raise Exception( + f"Unsupported kind '{match.group('kind')}' in _process_href_regex" + ) + + def normalize_href(self, href): + if href in self.final_hrefs: + return self.final_hrefs[href] + try: + logger.debug(f"Normalizing href {href}") + # final_href = requests.head(href).headers.get("Location") + # if final_href is None: + # logger.debug(f"Failed to HEAD {href}, falling back to GET") + final_href = requests.get(href, stream=True,
timeout=10).url + # parse final href and remove scheme + netloc + slash + parsed_final_href = urllib.parse.urlparse(final_href) + parsed_href = urllib.parse.urlparse(href) + chars_to_remove = len(parsed_final_href.scheme + "://") + + # remove domain if redirect is on same domain (almost always) + if parsed_final_href.netloc == parsed_href.netloc: + chars_to_remove += len(parsed_final_href.netloc) + + final_href = final_href[chars_to_remove:] + final_href = urllib.parse.unquote(final_href) + except Exception: + # this is quite expected for some missing items; this will be taken care + # of at retrieval, no way to do something better + final_href = href + self.final_hrefs[href] = final_href + logger.debug(f"Result is {final_href}") + return final_href + + def _process_href_regex(self, href, rel_prefix): + if href.startswith("/"): + href = self.configuration.main_url.geturl() + href + if href.startswith("http") and "ifixit.com/" in href: + href = self.normalize_href(href) + href = urllib.parse.quote(href) + match = self.href_regex.search(href) + res = ( + self._process_href_regex_dynamics(href=href, rel_prefix=rel_prefix) + or self._process_href_regex_nomatch( + href=href, rel_prefix=rel_prefix, match=match + ) + or self._process_href_regex_anchor(match=match) + or self._process_href_regex_guide(rel_prefix=rel_prefix, match=match) + or self._process_href_regex_device(rel_prefix=rel_prefix, match=match) + or self._process_href_regex_info(rel_prefix=rel_prefix, match=match) + or self._process_href_regex_user(rel_prefix=rel_prefix, match=match) + or self._process_href_regex_kind( + href=href, rel_prefix=rel_prefix, match=match + ) + ) + if res is None: + raise Exception("Unsupported match in _process_href_regex") + return res + + def _process_youtube(self, match, rel_prefix): + return ( + f'<a href="{rel_prefix}home/external_content?url={urllib.parse.quote(match.group("youtube_link"))}">' + f"<img src=\"{rel_prefix}assets/video-large-add-2.png\"></a>" + ) + + def _process_bgdimgurl(self, match, rel_prefix): + return ( + f"background-image:url({match.group('quote1')}{rel_prefix}" + f"{self.get_image_path(match.group('bgdimgurl'))}" + f"{match.group('quote2')})" + ) + + def _process_video(self): + return "<p>Video not scrapped</p>"
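
To make the arithmetic in `normalize_href` above concrete, here is a worked example of the scheme and netloc stripping (the redirect target is invented; real values depend on the live site):

```python
import urllib.parse

href = "https://www.ifixit.com/Device/Mac"
# Invented: pretend the site redirected us to this final URL.
final_href = "https://www.ifixit.com/Device/Mac_Desktop"

parsed_final_href = urllib.parse.urlparse(final_href)
parsed_href = urllib.parse.urlparse(href)

# Always drop "https://" (8 characters here).
chars_to_remove = len(parsed_final_href.scheme + "://")
# Same domain, the usual case: drop "www.ifixit.com" as well.
if parsed_final_href.netloc == parsed_href.netloc:
    chars_to_remove += len(parsed_final_href.netloc)

print(urllib.parse.unquote(final_href[chars_to_remove:]))
# -> /Device/Mac_Desktop
```

The leading slash is kept on purpose: callers such as `_build_category_path` in scraper_category.py strip it themselves with `final_href[1:]`.
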
" + + def _process_iframe(self, match, rel_prefix): + return ( + f'External content' + ) + + def _process_gbl_regex(self, match, rel_prefix): + if match.group("image_url"): + return ( + f" 0: + return user["username"] + if user["unique_username"] and len(user["unique_username"]) > 0: + return f"@{user['unique_username']}" + return "Anonymous" diff --git a/src/ifixit2zim/scraper.py b/src/ifixit2zim/scraper.py new file mode 100644 index 0000000..9f7bb2e --- /dev/null +++ b/src/ifixit2zim/scraper.py @@ -0,0 +1,500 @@ +import datetime +import io +import json +import pathlib +import shutil +import threading + +from jinja2 import Environment, FileSystemLoader, select_autoescape +from schedule import every +from zimscraperlib.image.transformation import resize_image +from zimscraperlib.inputs import compute_descriptions +from zimscraperlib.zim.creator import Creator + +from ifixit2zim.constants import ( + DEFAULT_HOMEPAGE, + ROOT_DIR, + TITLE, + Configuration, +) +from ifixit2zim.context import Context +from ifixit2zim.exceptions import CategoryHomePageContentError +from ifixit2zim.executor import Executor +from ifixit2zim.imager import Imager +from ifixit2zim.processor import Processor +from ifixit2zim.scraper_category import ScraperCategory +from ifixit2zim.scraper_guide import ScraperGuide +from ifixit2zim.scraper_homepage import ScraperHomepage +from ifixit2zim.scraper_info import ScraperInfo +from ifixit2zim.scraper_user import ScraperUser +from ifixit2zim.shared import logger +from ifixit2zim.utils import Utils + +LOCALE_LOCK = threading.Lock() + + +class IFixit2Zim: + def __init__(self, **kwargs): + self.configuration = Configuration(**kwargs) + for option in self.configuration.required: + if getattr(self.configuration, option) is None: + raise ValueError(f"Missing parameter `{option}`") + + self.lock = threading.Lock() + + self.utils = Utils(configuration=self.configuration) + + @property + def build_path(self): + return self.configuration.build_path + + def cleanup(self): + """Remove temp files and release resources before exiting""" + if not self.configuration.keep_build_dir: + logger.debug(f"Removing {self.build_path}") + shutil.rmtree(self.build_path, ignore_errors=True) + + def sanitize_inputs(self): + """input & metadata sanitation""" + logger.debug("Checking user-provided metadata") + + if not self.configuration.name: + is_selection = ( + self.configuration.categories + or self.configuration.guides + or self.configuration.infos + or self.configuration.no_category + or self.configuration.no_guide + or self.configuration.no_info + ) + self.configuration.name = "ifixit_{lang}_{selection}".format( + lang=self.configuration.language["iso-639-1"], + selection="selection" if is_selection else "all", + ) + + period = datetime.datetime.now(tz=datetime.UTC).strftime("%Y-%m") + if self.configuration.fname: + # make sure we were given a filename and not a path + self.configuration.fpath = pathlib.Path( + self.configuration.fname.format(period=period) + ) + if pathlib.Path(self.configuration.fpath.name) != self.configuration.fpath: + raise ValueError( + f"filename is not a filename: {self.configuration.fname}" + ) + else: + self.configuration.fpath = pathlib.Path( + f"{self.configuration.name}_{period}.zim" + ) + + if not self.configuration.title: + # Try to grab title in selected language, otherwise use title in English + # Logic is a bit complex because we need the title for the selected + # language in the selected language, or fallback to the title for the + # selected language in English. 
+ if ( + self.configuration.lang_code in TITLE + and f"title_{self.configuration.lang_code}" + in TITLE[self.configuration.lang_code] + ): + self.configuration.title = TITLE[self.configuration.lang_code][ + f"title_{self.configuration.lang_code}" + ] + else: + self.configuration.title = TITLE["en"][ + f"title_{self.configuration.lang_code}" + ] + self.configuration.title = self.metadata["title"] + self.configuration.title = self.configuration.title.strip() + + ( + self.configuration.description, + self.configuration.long_description, + ) = compute_descriptions( + self.metadata["description"], + self.configuration.description, + self.configuration.long_description, + ) + + self.configuration.author = self.configuration.author.strip() + + self.configuration.publisher = self.configuration.publisher.strip() + + self.configuration.tag = list( + { + *self.configuration.tag, + "_category:iFixit", + "iFixit", + "_videos:yes", + "_pictures:yes", + } + ) + + logger.debug( + "Configuration after sanitization:\n" + f"name: {self.configuration.name}\n" + f"fname: {self.configuration.fname}\n" + f"author: {self.configuration.author}\n" + f"publisher: {self.configuration.publisher}" + ) + + def add_assets(self): + """download and add site-wide assets, identified in metadata step""" + logger.info("Adding assets") + + # recursively add our assets, at a path identical to position in repo + assets_root = pathlib.Path(ROOT_DIR.joinpath("assets")) + for fpath in assets_root.glob("**/*"): + if not fpath.is_file(): + continue + path = str(fpath.relative_to(ROOT_DIR)) + + logger.debug(f"> {path}") + with self.lock: + self.creator.add_item_for(path=path, fpath=fpath) + + def setup(self): + # order matters as there are references between them + + # images handled on a different queue. + # mostly network I/O to retrieve and/or upload image.
+ # if not in S3 bucket, convert/optimize webp image + # svg images, stored but not optimized + + self.img_executor = Executor( + queue_size=100, + nb_workers=50, + prefix="IMG-T-", + ) + + src_illus_fpath = pathlib.Path(ROOT_DIR.joinpath("assets", "illustration.png")) + dst = io.BytesIO() + resize_image( + src=src_illus_fpath, + dst=dst, + width=48, + height=48, + method="thumbnail", + ) + + self.creator = Creator( + filename=self.configuration.output_path / self.configuration.fpath, + main_path=DEFAULT_HOMEPAGE, + workaround_nocancel=False, + ).config_metadata( + Illustration_48x48_at_1=dst.getvalue(), + Language=self.configuration.language["iso-639-3"], + Title=self.configuration.title, + Description=self.configuration.description, + Creator=self.configuration.author, + Publisher=self.configuration.publisher, + Name=self.configuration.name, + Tags=";".join(self.configuration.tag), + Date=datetime.datetime.now(tz=datetime.UTC).date(), + ) + + self.imager = Imager( + lock=self.lock, + creator=self.creator, + img_executor=self.img_executor, + utils=self.utils, + configuration=self.configuration, + ) + + # jinja2 environment setup + self.env = Environment( + loader=FileSystemLoader(ROOT_DIR.joinpath("templates")), + autoescape=select_autoescape(), + ) + + def _raise_helper(msg): + raise Exception(msg) + + self.processor = Processor( + lock=self.lock, + configuration=self.configuration, + creator=self.creator, + imager=self.imager, + ) + + context = Context( + lock=self.lock, + configuration=self.configuration, + creator=self.creator, + utils=self.utils, + metadata=self.metadata, + env=self.env, + processor=self.processor, + ) + + self.scraper_homepage = ScraperHomepage(context=context) + self.scraper_guide = ScraperGuide(context=context) + self.scraper_category = ScraperCategory(context=context) + self.scraper_info = ScraperInfo(context=context) + self.scraper_user = ScraperUser(context=context) + self.scrapers = [ + self.scraper_homepage, + self.scraper_category, + self.scraper_guide, + self.scraper_info, + self.scraper_user, + ] + + self.processor.get_guide_link_from_props = ( + self.scraper_guide.get_guide_link_from_props + ) + self.processor.get_category_link_from_props = ( + self.scraper_category.get_category_link_from_props + ) + self.processor.get_info_link_from_props = ( + self.scraper_info.get_info_link_from_props + ) + self.processor.get_user_link_from_props = ( + self.scraper_user.get_user_link_from_props + ) + + self.env.filters["get_category_link_from_obj"] = ( + self.scraper_category.get_category_link_from_obj + ) + self.env.filters["get_category_link_from_props"] = ( + self.scraper_category.get_category_link_from_props + ) + self.env.filters["get_guide_link_from_obj"] = ( + self.scraper_guide.get_guide_link_from_obj + ) + self.env.filters["get_guide_link_from_props"] = ( + self.scraper_guide.get_guide_link_from_props + ) + self.env.filters["get_info_link_from_obj"] = ( + self.scraper_info.get_info_link_from_obj + ) + self.env.filters["get_info_link_from_props"] = ( + self.scraper_info.get_info_link_from_props + ) + self.env.filters["get_user_link_from_obj"] = ( + self.scraper_user.get_user_link_from_obj + ) + self.env.filters["get_user_link_from_props"] = ( + self.scraper_user.get_user_link_from_props + ) + self.env.filters["guides_in_progress"] = self.processor.guides_in_progress + self.env.filters["category_count_parts"] = self.processor.category_count_parts + self.env.filters["category_count_tools"] = self.processor.category_count_tools + self.env.filters["get_image_path"] 
= self.processor.get_image_path + self.env.filters["get_image_url"] = self.processor.get_image_url + self.env.filters["cleanup_rendered_content"] = ( + self.processor.cleanup_rendered_content + ) + self.env.filters["get_timestamp_day_rendered"] = ( + self.processor.get_timestamp_day_rendered + ) + self.env.filters["get_item_comments_count"] = ( + self.processor.get_item_comments_count + ) + self.env.filters["get_guide_total_comments_count"] = ( + self.processor.get_guide_total_comments_count + ) + self.env.filters["get_user_display_name"] = self.processor.get_user_display_name + self.env.globals["raise"] = _raise_helper + self.env.globals["str"] = lambda x: str(x) + + for scraper in self.scrapers: + scraper.setup() + + def run(self): + # first report => creates a file with appropriate structure + self.report_progress() + + s3_storage = ( + self.utils.setup_s3_and_check_credentials( + self.configuration.s3_url_with_credentials + ) + if self.configuration.s3_url_with_credentials + else None + ) + s3_msg = ( + f"\n" + f" using cache: {s3_storage.url.netloc} " + f"with bucket: {s3_storage.bucket_name}" + if s3_storage + else "" + ) + del s3_storage + + logger.info( + f"Starting scraper with:\n" + f" language: {self.configuration.language['english']}" + f" ({self.configuration.domain})\n" + f" output: {self.configuration.output_path}\n" + f" build: {self.build_path}\n" + f"{s3_msg}" + ) + + self.metadata = self.get_online_metadata() + logger.debug( + f"Additional metadata scrapped online:\n" + f"title: {self.metadata['title']}\n" + f"description: {self.metadata['description']}\n" + f"stats: {self.metadata['stats']}\n" + ) + self.sanitize_inputs() + + logger.debug("Starting Zim creation") + self.setup() + self.creator.start() + + try: + self.add_assets() + + for scraper in self.scrapers: + scraper.build_expected_items() + self.report_progress() + + # set a timer to report progress only every 10 seconds, no need to do it + # after every item scrapped + every(10).seconds.do(self.report_progress) + + while True: + for scraper in self.scrapers: + scraper.scrape_items() + needs_rerun = False + if not self.configuration.scrape_only_first_items: + for scraper in self.scrapers: + if not scraper.items_queue.empty(): + needs_rerun = True + if not needs_rerun: + break + + logger.info("Awaiting images") + self.img_executor.shutdown() + + self.report_progress() + + stats = "Stats: " + for scraper in self.scrapers: + stats += ( + f"{len(scraper.expected_items_keys)} {scraper.get_items_name()}, " + ) + for scraper in self.scrapers: + stats += ( + f"{len(scraper.missing_items_keys)} missing" + f" {scraper.get_items_name()}, " + ) + for scraper in self.scrapers: + stats += ( + f"{len(scraper.error_items_keys)} {scraper.get_items_name()}" + " in error, " + ) + stats += f"{len(self.imager.handled)} images" + + logger.info(stats) + + logger.info("Null categories:") + for key in self.processor.null_categories: + logger.info(f"\t{key}") + + logger.info("IFIXIT_EXTERNAL URLS:") + for exturl in sorted(self.processor.ifixit_external_content): + logger.info(f"\t{exturl}") + + except Exception as exc: + # request Creator not to create a ZIM file on finish + self.creator.can_finish = False + if isinstance(exc, KeyboardInterrupt): + logger.error("KeyboardInterrupt, exiting.") + else: + logger.error(f"Interrupting process due to error: {exc}") + logger.exception(exc) + self.imager.abort() + self.img_executor.shutdown(wait=False) + return 1 + else: + if self.creator.can_finish: + logger.info("Finishing ZIM file") + with
self.lock: + self.creator.finish() + logger.info( + f"Finished ZIM {self.creator.filename.name} " + f"in {self.creator.filename.parent}" + ) + finally: + logger.info("Cleaning up") + with self.lock: + self.cleanup() + + logger.info("Scraper has finished normally") + + def report_progress(self): + if not self.configuration.stats_path: + return + done = 0 + total = 0 + for scraper in self.scrapers: + scraper_total = len(scraper.expected_items_keys) + len( + scraper.unexpected_items_keys + ) + scraper_remains = scraper.items_queue.qsize() + scraper_done = scraper_total - scraper_remains + total += scraper_total + done += scraper_done + progress = { + "done": done, + "total": total, + } + with open(self.configuration.stats_path, "w") as outfile: + json.dump(progress, outfile, indent=2) + + def get_online_metadata(self): + """metadata from online website, looking at homepage source code""" + logger.info("Fetching website metadata") + + soup, _ = self.utils.get_soup("/") + + return { + "title": soup.find( + "title" + ).string, # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess] + "description": soup.find( + "meta", attrs={"name": "description"} + ).attrs.get( # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess] + "content" + ), + "stats": self._extract_stats_from_page(soup), + "current_year": datetime.datetime.now(tz=datetime.UTC).year, + } + + def _extract_stats_from_page(self, soup): + results = soup.findAll("div", {"data-name": "KPIDisplay"}) + if len(results) == 0: + raise CategoryHomePageContentError("No KPIs found") + if len(results) > 1: + raise CategoryHomePageContentError("Too many KPIs found") + kpi = results[0].get("data-props") + if kpi is None: + raise CategoryHomePageContentError("KPIs not found in data-props") + + try: + kpi_d = json.loads(kpi) + except json.decoder.JSONDecodeError as e: + raise CategoryHomePageContentError( + f"Failed to decode stats from '{kpi}'" + ) from e + + if "stats" not in kpi_d: + raise CategoryHomePageContentError(f"Stats not found in KPIs '{kpi}'") + + stats = kpi_d["stats"] + + if len(stats) == 0: + raise CategoryHomePageContentError("Stats array is empty") + for stat in stats: + if "value" not in stat: + raise CategoryHomePageContentError( + f"No value found in stat '{json.dumps(stat)}'" + ) + if "label" not in stat: + raise CategoryHomePageContentError( + f"No label found in stat '{json.dumps(stat)}'" + ) + + return stats diff --git a/ifixit2zim/scraper_category.py b/src/ifixit2zim/scraper_category.py similarity index 67% rename from ifixit2zim/scraper_category.py rename to src/ifixit2zim/scraper_category.py index 281e9a8..ab1fcee 100644 --- a/ifixit2zim/scraper_category.py +++ b/src/ifixit2zim/scraper_category.py @@ -1,18 +1,18 @@ -import urllib +import urllib.parse -from .constants import CATEGORY_LABELS, URLS -from .exceptions import UnexpectedDataKindException -from .scraper_generic import ScraperGeneric -from .shared import Global, logger -from .utils import get_api_content +from ifixit2zim.constants import CATEGORY_LABELS, URLS +from ifixit2zim.context import Context +from ifixit2zim.exceptions import UnexpectedDataKindExceptionError +from ifixit2zim.scraper_generic import ScraperGeneric +from ifixit2zim.shared import logger class ScraperCategory(ScraperGeneric): - def __init__(self): - super().__init__() + def __init__(self, context: Context): + super().__init__(context) def setup(self): - self.category_template = Global.env.get_template("category.html") + self.category_template = 
self.env.get_template("category.html") def get_items_name(self): return "category" @@ -27,19 +27,19 @@ def _add_category_to_scrape(self, category_key, category_title, is_expected): ) def _get_category_key_from_title(self, category_title): - return Global.convert_title_to_filename(category_title.lower()) + return self.processor.convert_title_to_filename(category_title.lower()) def _build_category_path(self, category_title): href = ( - Global.conf.main_url.geturl() + self.configuration.main_url.geturl() + f"/Device/{category_title.replace('/', ' ')}" ) - final_href = Global.normalize_href(href) + final_href = self.processor.normalize_href(href) return final_href[1:] def get_category_link_from_obj(self, category): if "title" not in category or not category["title"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( f"Impossible to extract category title from {category}" ) category_title = category["title"] @@ -47,12 +47,12 @@ def get_category_link_from_obj(self, category): def get_category_link_from_props(self, category_title): category_path = urllib.parse.quote(self._build_category_path(category_title)) - if Global.conf.no_category: + if self.configuration.no_category: return f"home/not_scrapped?url={category_path}" category_key = self._get_category_key_from_title(category_title) - if Global.conf.categories: + if self.configuration.categories: is_not_included = True - for other_category in Global.conf.categories: + for other_category in self.configuration.categories: other_category_key = self._get_category_key_from_title(other_category) if other_category_key == category_key: is_not_included = False @@ -69,39 +69,41 @@ def _process_categories(self, categories): self._process_categories(categories[category]) def build_expected_items(self): - if Global.conf.no_category: + if self.configuration.no_category: logger.info("No category required") return - if Global.conf.categories: + if self.configuration.categories: logger.info("Adding required categories as expected") - for category in Global.conf.categories: + for category in self.configuration.categories: category_key = self._get_category_key_from_title(category) self._add_category_to_scrape(category_key, category, True) return logger.info("Downloading list of categories") - categories = get_api_content("/categories", includeStubs=True) + categories = self.utils.get_api_content("/categories", includeStubs=True) self._process_categories(categories) - logger.info("{} categories found".format(len(self.expected_items_keys))) + logger.info(f"{len(self.expected_items_keys)} categories found") - def get_one_item_content(self, item_key, item_data): + def get_one_item_content(self, item_key, item_data): # noqa ARG002 categoryid = item_key - category_content = get_api_content( - f"/wikis/CATEGORY/{categoryid}", langid=Global.conf.lang_code + category_content = self.utils.get_api_content( + f"/wikis/CATEGORY/{categoryid}", langid=self.configuration.lang_code ) if category_content and category_content["revisionid"] > 0: return category_content logger.warning("Falling back to category in EN") - category_content = get_api_content(f"/wikis/CATEGORY/{categoryid}", langid="en") + category_content = self.utils.get_api_content( + f"/wikis/CATEGORY/{categoryid}", langid="en" + ) if category_content and category_content["revisionid"] > 0: return category_content for lang in URLS.keys(): logger.warning(f"Falling back to category in {lang}") - category_content = get_api_content( + category_content = self.utils.get_api_content( 
f"/wikis/CATEGORY/{categoryid}", langid=lang ) @@ -109,28 +111,28 @@ def get_one_item_content(self, item_key, item_data): return category_content logger.warning(f"Impossible to get category content: {item_key}") - Global.null_categories.add(item_key) + self.processor.null_categories.add(item_key) return None - def add_item_redirect(self, item_key, item_data, redirect_kind): + def add_item_redirect(self, item_key, item_data, redirect_kind): # noqa ARG002 path = self._build_category_path(item_data["category_title"]) - Global.add_redirect( + self.processor.add_redirect( path=path, target_path=f"home/{redirect_kind}?{urllib.parse.urlencode({'url':path})}", ) - def process_one_item(self, item_key, item_data, item_content): + def process_one_item(self, item_key, item_data, item_content): # noqa ARG002 category_content = item_content category_rendered = self.category_template.render( category=category_content, - label=CATEGORY_LABELS[Global.conf.lang_code], - metadata=Global.metadata, - lang=Global.conf.lang_code, + label=CATEGORY_LABELS[self.configuration.lang_code], + metadata=self.metadata, + lang=self.configuration.lang_code, ) - Global.add_html_item( + self.processor.add_html_item( path=self._build_category_path(category_title=category_content["title"]), title=category_content["display_title"], content=category_rendered, diff --git a/ifixit2zim/scraper_generic.py b/src/ifixit2zim/scraper_generic.py similarity index 77% rename from ifixit2zim/scraper_generic.py rename to src/ifixit2zim/scraper_generic.py index b9e4bb4..7a6e117 100644 --- a/ifixit2zim/scraper_generic.py +++ b/src/ifixit2zim/scraper_generic.py @@ -4,18 +4,50 @@ from schedule import run_pending -from .exceptions import FinalScrapingFailure -from .shared import Global, logger +from ifixit2zim.context import Context +from ifixit2zim.exceptions import FinalScrapingFailureError +from ifixit2zim.shared import logger + +FIRST_ITEMS_COUNT = 5 class ScraperGeneric(ABC): - def __init__(self): - self.expected_items_keys = dict() - self.unexpected_items_keys = dict() + def __init__(self, context: Context): + self.context = context + self.expected_items_keys = {} + self.unexpected_items_keys = {} self.items_queue = Queue() self.missing_items_keys = set() self.error_items_keys = set() + @property + def configuration(self): + return self.context.configuration + + @property + def utils(self): + return self.context.utils + + @property + def metadata(self): + return self.context.metadata + + @property + def env(self): + return self.context.env + + @property + def lock(self): + return self.context.lock + + @property + def creator(self): + return self.context.creator + + @property + def processor(self): + return self.context.processor + @abstractmethod def setup(self): pass @@ -41,7 +73,7 @@ def process_one_item(self, item_key, item_data, item_content): pass def add_item_to_scrape( - self, item_key, item_data, is_expected, warn_unexpected=True + self, item_key, item_data, is_expected, *, warn_unexpected=True ): item_key = str(item_key) # just in case it's an int if ( @@ -80,7 +112,6 @@ def add_item_error_redirect(self, item_key, item_data): pass # ignore exceptions, we are already inside an exception handling def scrape_one_item(self, item_key, item_data): - item_content = self.get_one_item_content(item_key, item_data) if item_content is None: @@ -94,7 +125,6 @@ def scrape_one_item(self, item_key, item_data): self.process_one_item(item_key, item_data, item_content) def scrape_items(self): - logger.info( f"Scraping {self.get_items_name()} items 
({self.items_queue.qsize()}" " items remaining)" @@ -103,7 +133,10 @@ def scrape_items(self): num_items = 1 while not self.items_queue.empty(): run_pending() - if Global.conf.scrape_only_first_items and num_items > 5: + if ( + self.configuration.scrape_only_first_items + and num_items > FIRST_ITEMS_COUNT + ): break item = self.items_queue.get(block=False) item_key = item["key"] @@ -126,9 +159,9 @@ def scrape_items(self): len(self.missing_items_keys) * 100 / (len(self.expected_items_keys) + len(self.unexpected_items_keys)) - > Global.conf.max_missing_items_percent + > self.configuration.max_missing_items_percent ): - raise FinalScrapingFailure( + raise FinalScrapingFailureError( f"Too many {self.get_items_name()}s found missing: " f"{len(self.missing_items_keys)}" ) @@ -136,9 +169,9 @@ def scrape_items(self): len(self.error_items_keys) * 100 / (len(self.expected_items_keys) + len(self.unexpected_items_keys)) - > Global.conf.max_error_items_percent + > self.configuration.max_error_items_percent ): - raise FinalScrapingFailure( + raise FinalScrapingFailureError( f"Too many {self.get_items_name()}s failed to be processed: " f"{len(self.error_items_keys)}" ) diff --git a/ifixit2zim/scraper_guide.py b/src/ifixit2zim/scraper_guide.py similarity index 79% rename from ifixit2zim/scraper_guide.py rename to src/ifixit2zim/scraper_guide.py index 0a5089b..7ab690c 100644 --- a/ifixit2zim/scraper_guide.py +++ b/src/ifixit2zim/scraper_guide.py @@ -1,6 +1,6 @@ -import urllib +import urllib.parse -from .constants import ( +from ifixit2zim.constants import ( DIFFICULTY_EASY, DIFFICULTY_HARD, DIFFICULTY_MODERATE, @@ -10,18 +10,18 @@ UNKNOWN_LOCALE, UNKNOWN_TITLE, ) -from .exceptions import UnexpectedDataKindException -from .scraper_generic import ScraperGeneric -from .shared import Global, logger -from .utils import get_api_content +from ifixit2zim.context import Context +from ifixit2zim.exceptions import UnexpectedDataKindExceptionError +from ifixit2zim.scraper_generic import ScraperGeneric +from ifixit2zim.shared import logger class ScraperGuide(ScraperGeneric): - def __init__(self): - super().__init__() + def __init__(self, context: Context): + super().__init__(context) def setup(self): - self.guide_template = Global.env.get_template("guide.html") + self.guide_template = self.env.get_template("guide.html") def get_items_name(self): return "guide" @@ -37,22 +37,22 @@ def _add_guide_to_scrape(self, guideid, guidetitle, locale, is_expected): is_expected, ) - def _build_guide_path(self, guideid, guidetitle): - href = Global.conf.main_url.geturl() + f"/Guide/-/{guideid}" - final_href = Global.normalize_href(href) + def _build_guide_path(self, guideid, guidetitle): # noqa ARG002 + href = self.configuration.main_url.geturl() + f"/Guide/-/{guideid}" + final_href = self.processor.normalize_href(href) return final_href[1:] def get_guide_link_from_obj(self, guide): if "guideid" not in guide or not guide["guideid"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( f"Impossible to extract guide id from {guide}" ) if "locale" not in guide or not guide["locale"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( f"Impossible to extract guide locale from {guide}" ) if "title" not in guide or not guide["title"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( f"Impossible to extract guide title from {guide}" ) guideid = guide["guideid"] @@ -80,28 +80,28 @@ def get_guide_link_from_props( guide_path = urllib.parse.quote( 
self._build_guide_path(guideid=guideid, guidetitle=guidetitle) ) - if Global.conf.no_guide: + if self.configuration.no_guide: return f"home/not_scrapped?url={guide_path}" - if Global.conf.guides and str(guideid) not in Global.conf.guides: + if self.configuration.guides and str(guideid) not in self.configuration.guides: return f"home/not_scrapped?url={guide_path}" self._add_guide_to_scrape(guideid, guidetitle, guidelocale, False) return guide_path def build_expected_items(self): - if Global.conf.no_guide: + if self.configuration.no_guide: logger.info("No guide required") return - if Global.conf.guides: + if self.configuration.guides: logger.info("Adding required guides as expected") - for guide in Global.conf.guides: + for guide in self.configuration.guides: self._add_guide_to_scrape(guide, UNKNOWN_TITLE, UNKNOWN_LOCALE, True) return logger.info("Downloading list of guides") limit = 200 offset = 0 while True: - guides = get_api_content("/guides", limit=limit, offset=offset) - if len(guides) == 0: + guides = self.utils.get_api_content("/guides", limit=limit, offset=offset) + if not guides or len(guides) == 0: break for guide in guides: # we ignore archived guides since they are not accessible anyway @@ -114,27 +114,29 @@ def build_expected_items(self): # on this endpoint, so we consider it as unknown for now self._add_guide_to_scrape(guideid, UNKNOWN_TITLE, UNKNOWN_LOCALE, True) offset += limit - if Global.conf.scrape_only_first_items: + if self.configuration.scrape_only_first_items: logger.warning( "Aborting the retrieval of all guides since only first items" " will be scraped anyway" ) break - logger.info("{} guides found".format(len(self.expected_items_keys))) + logger.info(f"{len(self.expected_items_keys)} guides found") def get_one_item_content(self, item_key, item_data): guideid = item_key guide = item_data locale = guide["locale"] if locale == UNKNOWN_LOCALE: - locale = Global.conf.lang_code # fallback value + locale = self.configuration.lang_code # fallback value if locale == "ja": locale = "jp" # Unusual iFixit convention - guide_content = get_api_content(f"/guides/{guideid}", langid=locale) + guide_content = self.utils.get_api_content(f"/guides/{guideid}", langid=locale) if guide_content is None and locale != "en": # guide is most probably available in English anyway - guide_content = get_api_content(f"/guides/{guideid}", langid="en") + guide_content = self.utils.get_api_content( + f"/guides/{guideid}", langid="en" + ) return guide_content @@ -146,12 +148,12 @@ def add_item_redirect(self, item_key, item_data, redirect_kind): logger.warning(f"Cannot add redirect for guide {guideid} in error") return path = self._build_guide_path(guideid, guidetitle) - Global.add_redirect( + self.processor.add_redirect( path=path, target_path=f"home/{redirect_kind}?{urllib.parse.urlencode({'url':path})}", ) - def process_one_item(self, item_key, item_data, item_content): + def process_one_item(self, item_key, item_data, item_content): # noqa ARG002 guide_content = item_content if guide_content["type"] != "teardown": @@ -166,7 +168,7 @@ def process_one_item(self, item_key, item_data, item_content): elif guide_content["difficulty"] in DIFFICULTY_VERY_HARD: guide_content["difficulty_class"] = "difficulty-5" else: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Unknown guide difficulty: '{}' in guide {}".format( guide_content["difficulty"], guide_content["guideid"], @@ -175,7 +177,7 @@ for step in 
guide_content["steps"]: if not step["media"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Missing media attribute in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) @@ -185,14 +187,14 @@ def process_one_item(self, item_key, item_data, item_content): "video", "embed", ]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Unrecognized media type in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) ) if step["media"]["type"] == "video": if "data" not in step["media"] or not step["media"]["data"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Missing 'data' in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) @@ -201,7 +203,7 @@ def process_one_item(self, item_key, item_data, item_content): "image" not in step["media"]["data"] or not step["media"]["data"]["image"] ): - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Missing outer 'image' in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) @@ -210,14 +212,14 @@ def process_one_item(self, item_key, item_data, item_content): "image" not in step["media"]["data"]["image"] or not step["media"]["data"]["image"]["image"] ): - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Missing inner 'image' in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) ) if step["media"]["type"] == "embed": if "data" not in step["media"] or not step["media"]["data"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Missing 'data' in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) @@ -226,13 +228,13 @@ def process_one_item(self, item_key, item_data, item_content): "html" not in step["media"]["data"] or not step["media"]["data"]["html"] ): - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Missing 'html' in step {} of guide {}".format( step["stepid"], guide_content["guideid"] ) ) for line in step["lines"]: - if not line["bullet"] in [ + if line["bullet"] not in [ "black", "red", "orange", @@ -246,7 +248,7 @@ def process_one_item(self, item_key, item_data, item_content): "icon_caution", "icon_reminder", ]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( "Unrecognized bullet '{}' in step {} of guide {}".format( line["bullet"], step["stepid"], @@ -255,11 +257,11 @@ def process_one_item(self, item_key, item_data, item_content): ) guide_rendered = self.guide_template.render( guide=guide_content, - label=GUIDE_LABELS[Global.conf.lang_code], - metadata=Global.metadata, + label=GUIDE_LABELS[self.configuration.lang_code], + metadata=self.metadata, ) - Global.add_html_item( + self.processor.add_html_item( path=self._build_guide_path( guideid=guide_content["guideid"], guidetitle=guide_content["title"] ), diff --git a/ifixit2zim/scraper_homepage.py b/src/ifixit2zim/scraper_homepage.py similarity index 78% rename from ifixit2zim/scraper_homepage.py rename to src/ifixit2zim/scraper_homepage.py index bc0db53..ea40679 100644 --- a/ifixit2zim/scraper_homepage.py +++ b/src/ifixit2zim/scraper_homepage.py @@ -1,23 +1,20 @@ -# -*- coding: utf-8 -*- - -import datetime import json import re -from .constants import DEFAULT_HOMEPAGE, HOME_LABELS -from .exceptions import CategoryHomePageContentError -from .scraper_generic import ScraperGeneric -from .shared import Global, logger -from .utils import get_soup +from 
ifixit2zim.constants import DEFAULT_HOMEPAGE, HOME_LABELS +from ifixit2zim.context import Context +from ifixit2zim.exceptions import CategoryHomePageContentError +from ifixit2zim.scraper_generic import ScraperGeneric +from ifixit2zim.shared import logger class ScraperHomepage(ScraperGeneric): - def __init__(self): - super().__init__() + def __init__(self, context: Context): + super().__init__(context) def setup(self): - self.homepage_template = Global.env.get_template("home.html") - self.not_here_template = Global.env.get_template("not_here.html") + self.homepage_template = self.env.get_template("home.html") + self.not_here_template = self.env.get_template("not_here.html") def get_items_name(self): return "home" @@ -25,15 +22,14 @@ def get_items_name(self): def build_expected_items(self): self.add_item_to_scrape(1, 1, True) - def get_one_item_content(self, item_key, item_data): - soup, _ = get_soup("/Guide") + def get_one_item_content(self, item_key, item_data): # noqa ARG002 + soup, _ = self.utils.get_soup("/Guide") return soup - def add_item_redirect(self, item_key, item_data, redirect_kind): + def add_item_redirect(self, item_key, item_data, redirect_kind): # noqa ARG002 logger.warning("Not supposed to add a redirect for a home item") - return - def process_one_item(self, item_key, item_data, item_content): + def process_one_item(self, item_key, item_data, item_content): # noqa ARG002 soup = item_content # extract and clean main content @@ -48,99 +44,102 @@ def process_one_item(self, item_key, item_data, item_content): } logger.debug( - "Content extracted from /Guide:\n" f"{json.dumps(home_content,indent=2)}" + f"Content extracted from /Guide:\n {json.dumps(home_content,indent=2)}" ) homepage = self.homepage_template.render( home_content=home_content, - metadata=Global.metadata, - label=HOME_LABELS[Global.conf.lang_code], + metadata=self.metadata, + label=HOME_LABELS[self.configuration.lang_code], ) not_scrapped = self.not_here_template.render( - metadata=Global.metadata, + metadata=self.metadata, kind="not_scrapped", ) external_content = self.not_here_template.render( - metadata=Global.metadata, + metadata=self.metadata, kind="external_content", ) unavailable_offline = self.not_here_template.render( - metadata=Global.metadata, + metadata=self.metadata, kind="unavailable_offline", ) not_yet_available = self.not_here_template.render( - metadata=Global.metadata, + metadata=self.metadata, kind="not_yet_available", ) missing = self.not_here_template.render( - metadata=Global.metadata, + metadata=self.metadata, kind="missing", ) error_content = self.not_here_template.render( - metadata=Global.metadata, + metadata=self.metadata, kind="error", ) - with Global.lock: - Global.creator.add_item_for( + with self.lock: + if not self.creator: + raise Exception("Please set creator first") + + self.creator.add_item_for( path="home/home", - title=Global.conf.title, + title=self.configuration.title, content=homepage, mimetype="text/html", is_front=True, ) - Global.creator.add_redirect(path=DEFAULT_HOMEPAGE, target_path="home/home") + self.creator.add_redirect(path=DEFAULT_HOMEPAGE, target_path="home/home") - Global.creator.add_item_for( + self.creator.add_item_for( path="home/not_scrapped", - title=Global.conf.title, + title=self.configuration.title, content=not_scrapped, mimetype="text/html", is_front=False, ) - Global.creator.add_item_for( + self.creator.add_item_for( path="home/external_content", - title=Global.conf.title, + title=self.configuration.title, content=external_content, 
mimetype="text/html", is_front=False, ) - Global.creator.add_item_for( + self.creator.add_item_for( path="home/unavailable_offline", - title=Global.conf.title, + title=self.configuration.title, content=unavailable_offline, mimetype="text/html", is_front=False, ) - Global.creator.add_item_for( + self.creator.add_item_for( path="home/not_yet_available", - title=Global.conf.title, + title=self.configuration.title, content=not_yet_available, mimetype="text/html", is_front=False, ) - Global.creator.add_item_for( + self.creator.add_item_for( path="home/missing", - title=Global.conf.title, + title=self.configuration.title, content=missing, mimetype="text/html", is_front=False, ) - Global.creator.add_item_for( + self.creator.add_item_for( path="home/error", - title=Global.conf.title, + title=self.configuration.title, content=error_content, mimetype="text/html", is_front=False, @@ -153,16 +152,16 @@ def _extract_page_title_from_page(self, soup): p = soup.select(page_title_selector) if len(p) == 0: raise CategoryHomePageContentError( - "No text found in page with selector " f"'{page_title_selector}'" + f"No text found in page with selector '{page_title_selector}'" ) if len(p) > 1: raise CategoryHomePageContentError( - "Too many text found in page with selector " f"'{page_title_selector}'" + f"Too many text found in page with selector '{page_title_selector}'" ) text = p[0].text if len(text) == 0: raise CategoryHomePageContentError( - "Empty text found in page with selector " f"'{page_title_selector}'" + f"Empty text found in page with selector '{page_title_selector}'" ) return text @@ -171,7 +170,7 @@ def _extract_primary_title_from_page(self, soup): p = soup.select(primary_title_selector) if len(p) == 0: raise CategoryHomePageContentError( - "No text found in page with selector " f"'{primary_title_selector}'" + f"No text found in page with selector '{primary_title_selector}'" ) if len(p) > 1: raise CategoryHomePageContentError( @@ -181,7 +180,7 @@ def _extract_primary_title_from_page(self, soup): text = p[0].text if len(text) == 0: raise CategoryHomePageContentError( - "Empty text found in page with selector " f"'{primary_title_selector}'" + f"Empty text found in page with selector '{primary_title_selector}'" ) return text @@ -190,7 +189,7 @@ def _extract_secondary_title_from_page(self, soup): p = soup.select(secondary_title_selector) if len(p) == 0: raise CategoryHomePageContentError( - "No text found in page with selector " f"'{secondary_title_selector}'" + f"No text found in page with selector '{secondary_title_selector}'" ) if len(p) > 1: raise CategoryHomePageContentError( @@ -398,8 +397,8 @@ def _extract_count_from_sub_category(self, sc): return int(text) except ValueError: raise CategoryHomePageContentError( - f"Failed to convert span text '{text}' to integer for " "sub-category" - ) + f"Failed to convert span text '{text}' to integer for sub-category" + ) from None def _extract_title_from_sub_category(self, sc): sub_category_img_css_selector = "span.overflow-slide-in" @@ -422,50 +421,12 @@ def _extract_title_from_sub_category(self, sc): ) return title - def _extract_stats_from_page(self, soup): - results = soup.findAll("div", {"data-name": "KPIDisplay"}) - if len(results) == 0: - raise CategoryHomePageContentError("No KPIs found") - if len(results) > 1: - raise CategoryHomePageContentError("Too many KPIs found") - kpi = results[0].get("data-props") - if kpi is None: - raise CategoryHomePageContentError("KPIs not found in data-props") - - try: - kpi_d = json.loads(kpi) - except 
json.decoder.JSONDecodeError as e: - raise CategoryHomePageContentError( - "Failed to decode stats from '{}' to integer for stat {}".format(kpi, e) - ) - - if "stats" not in kpi_d: - raise CategoryHomePageContentError( - "Stats not found in KPIs '{}'".format(kpi) - ) - - stats = kpi_d["stats"] - - if len(stats) == 0: - raise CategoryHomePageContentError("Stats array is empty") - for stat in stats: - if "value" not in stat: - raise CategoryHomePageContentError( - "No value found in stat '{}'".format(json.dump(stat)) - ) - if "label" not in stat: - raise CategoryHomePageContentError( - "No label found in stat '{}'".format(json.dump(stat)) - ) - - return stats - def _extract_details_from_single_stat(self, fs): stat_text_css_selector = "chakra-stat__help-text" p = fs.select(stat_text_css_selector) if len(p) == 0: raise CategoryHomePageContentError( - "No text found in stat with selector " f"'{stat_text_css_selector}'" + f"No text found in stat with selector '{stat_text_css_selector}'" ) if len(p) > 1: raise CategoryHomePageContentError( @@ -475,14 +436,14 @@ def _extract_details_from_single_stat(self, fs): stat_text = p[0].text if len(stat_text) == 0: raise CategoryHomePageContentError( - "Empty text found in stat with selector " f"'{stat_text_css_selector}'" + f"Empty text found in stat with selector '{stat_text_css_selector}'" ) stat_number_css_selector = "chakra-stat__number" p = fs.select(stat_number_css_selector) if len(p) == 0: raise CategoryHomePageContentError( - "No number found in stat with selector " f"'{stat_number_css_selector}'" + f"No number found in stat with selector '{stat_number_css_selector}'" ) if len(p) > 1: raise CategoryHomePageContentError( @@ -508,19 +469,4 @@ def _extract_details_from_single_stat(self, fs): except ValueError: raise CategoryHomePageContentError( f"Failed to convert text '{stat_number}' to integer for stat" - ) - - def get_online_metadata(self): - """metadata from online website, looking at homepage source code""" - logger.info("Fetching website metadata") - - soup, _ = get_soup("/") - - return { - "title": soup.find("title").string, - "description": soup.find("meta", attrs={"name": "description"}).attrs.get( - "content" - ), - "stats": self._extract_stats_from_page(soup), - "current_year": datetime.date.today().year, - } + ) from None diff --git a/ifixit2zim/scraper_info.py b/src/ifixit2zim/scraper_info.py similarity index 65% rename from ifixit2zim/scraper_info.py rename to src/ifixit2zim/scraper_info.py index 115df07..f846382 100644 --- a/ifixit2zim/scraper_info.py +++ b/src/ifixit2zim/scraper_info.py @@ -1,18 +1,18 @@ -import urllib +import urllib.parse -from .constants import UNAVAILABLE_OFFLINE_INFOS -from .exceptions import UnexpectedDataKindException -from .scraper_generic import ScraperGeneric -from .shared import Global, logger -from .utils import get_api_content +from ifixit2zim.constants import UNAVAILABLE_OFFLINE_INFOS +from ifixit2zim.context import Context +from ifixit2zim.exceptions import UnexpectedDataKindExceptionError +from ifixit2zim.scraper_generic import ScraperGeneric +from ifixit2zim.shared import logger class ScraperInfo(ScraperGeneric): - def __init__(self): - super().__init__() + def __init__(self, context: Context): + super().__init__(context) def setup(self): - self.info_template = Global.env.get_template("info.html") + self.info_template = self.env.get_template("info.html") def get_items_name(self): return "info" @@ -27,16 +27,19 @@ def _add_info_to_scrape(self, info_key, info_title, is_expected): ) def 
_get_info_key_from_title(self, info_title): - return Global.convert_title_to_filename(info_title.lower()) + return self.processor.convert_title_to_filename(info_title.lower()) def _build_info_path(self, info_title): - href = Global.conf.main_url.geturl() + f"/Info/{info_title.replace('/', ' ')}" - final_href = Global.normalize_href(href) + href = ( + self.configuration.main_url.geturl() + + f"/Info/{info_title.replace('/', ' ')}" + ) + final_href = self.processor.normalize_href(href) return final_href[1:] def get_info_link_from_obj(self, info): if "title" not in info or not info["title"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( f"Impossible to extract info title from {info}" ) info_title = info["title"] @@ -44,14 +47,14 @@ def get_info_link_from_obj(self, info): def get_info_link_from_props(self, info_title): info_path = urllib.parse.quote(self._build_info_path(info_title)) - if Global.conf.no_info: + if self.configuration.no_info: return f"home/not_scrapped?url={info_path}" if info_title in UNAVAILABLE_OFFLINE_INFOS: return f"home/unavailable_offline?url={info_path}" info_key = self._get_info_key_from_title(info_title) - if Global.conf.infos: + if self.configuration.infos: is_not_included = True - for other_info in Global.conf.infos: + for other_info in self.configuration.infos: other_info_key = self._get_info_key_from_title(other_info) if other_info_key == info_key: is_not_included = False @@ -61,12 +64,12 @@ def get_info_link_from_props(self, info_title): return info_path def build_expected_items(self): - if Global.conf.no_info: + if self.configuration.no_info: logger.info("No info required") return - if Global.conf.infos: + if self.configuration.infos: logger.info("Adding required infos as expected") - for info_title in Global.conf.infos: + for info_title in self.configuration.infos: info_key = self._get_info_key_from_title(info_title) self._add_info_to_scrape(info_key, info_title, True) return @@ -74,45 +77,47 @@ def build_expected_items(self): limit = 200 offset = 0 while True: - info_wikis = get_api_content("/wikis/INFO", limit=limit, offset=offset) - if len(info_wikis) == 0: + info_wikis = self.utils.get_api_content( + "/wikis/INFO", limit=limit, offset=offset + ) + if not info_wikis or len(info_wikis) == 0: break for info_wiki in info_wikis: info_title = info_wiki["title"] info_key = self._get_info_key_from_title(info_title) self._add_info_to_scrape(info_key, info_title, True) offset += limit - if Global.conf.scrape_only_first_items: + if self.configuration.scrape_only_first_items: logger.warning( "Aborting the retrieval of all infos since only first items" " will be scraped anyway" ) break - logger.info("{} info found".format(len(self.expected_items_keys))) + logger.info(f"{len(self.expected_items_keys)} info found") - def get_one_item_content(self, item_key, item_data): + def get_one_item_content(self, item_key, item_data): # noqa ARG002 info_wiki_title = item_key - info_wiki_content = get_api_content(f"/wikis/INFO/{info_wiki_title}") + info_wiki_content = self.utils.get_api_content(f"/wikis/INFO/{info_wiki_title}") return info_wiki_content - def add_item_redirect(self, item_key, item_data, redirect_kind): + def add_item_redirect(self, item_key, item_data, redirect_kind): # noqa ARG002 path = self._build_info_path(item_data["info_title"]) - Global.add_redirect( + self.processor.add_redirect( path=path, target_path=f"home/{redirect_kind}?{urllib.parse.urlencode({'url':path})}", ) - def process_one_item(self, item_key, item_data, 
item_content): + def process_one_item(self, item_key, item_data, item_content): # noqa ARG002 info_wiki_content = item_content info_wiki_rendered = self.info_template.render( info_wiki=info_wiki_content, # label=INFO_WIKI_LABELS[self.conf.lang_code], - metadata=Global.metadata, - lang=Global.conf.lang_code, + metadata=self.metadata, + lang=self.configuration.lang_code, ) - Global.add_html_item( + self.processor.add_html_item( path=self._build_info_path(info_wiki_content["title"]), title=info_wiki_content["display_title"], content=info_wiki_rendered, diff --git a/ifixit2zim/scraper_user.py b/src/ifixit2zim/scraper_user.py similarity index 76% rename from ifixit2zim/scraper_user.py rename to src/ifixit2zim/scraper_user.py index 46845cb..5d1cb78 100644 --- a/ifixit2zim/scraper_user.py +++ b/src/ifixit2zim/scraper_user.py @@ -1,19 +1,19 @@ -import urllib +import urllib.parse -from .constants import UNKNOWN_TITLE, USER_LABELS -from .exceptions import UnexpectedDataKindException -from .scraper_generic import ScraperGeneric -from .shared import Global, logger -from .utils import get_api_content +from ifixit2zim.constants import UNKNOWN_TITLE, USER_LABELS +from ifixit2zim.context import Context +from ifixit2zim.exceptions import UnexpectedDataKindExceptionError +from ifixit2zim.scraper_generic import ScraperGeneric +from ifixit2zim.shared import logger class ScraperUser(ScraperGeneric): - def __init__(self): - super().__init__() - self.user_id_to_titles = dict() + def __init__(self, context: Context): + super().__init__(context) + self.user_id_to_titles = {} def setup(self): - self.user_template = Global.env.get_template("user.html") + self.user_template = self.env.get_template("user.html") def get_items_name(self): return "user" @@ -26,7 +26,7 @@ def _add_user_to_scrape(self, userid, usertitle, is_expected): "usertitle": usertitle, }, is_expected, - False, + warn_unexpected=False, ) if userid in self.user_id_to_titles: self.user_id_to_titles[userid].append(usertitle) @@ -35,15 +35,15 @@ def _add_user_to_scrape(self, userid, usertitle, is_expected): def _build_user_path(self, userid, usertitle): href = ( - Global.conf.main_url.geturl() + self.configuration.main_url.geturl() + f"/User/{userid}/{usertitle.replace('/', ' ')}" ) - final_href = Global.normalize_href(href) + final_href = self.processor.normalize_href(href) return final_href[1:] def get_user_link_from_obj(self, user): if "userid" not in user or not user["userid"]: - raise UnexpectedDataKindException( + raise UnexpectedDataKindExceptionError( f"Impossible to extract user id from {user}" ) userid = user["userid"] @@ -62,20 +62,20 @@ def get_user_link_from_props(self, userid, usertitle): user_path = urllib.parse.quote( self._build_user_path(userid=userid, usertitle=usertitle) ) - if Global.conf.no_user: + if self.configuration.no_user: return f"home/not_scrapped?url={user_path}" - if Global.conf.users and str(userid) not in Global.conf.users: + if self.configuration.users and str(userid) not in self.configuration.users: return f"home/not_scrapped?url={user_path}" self._add_user_to_scrape(userid, usertitle, False) return user_path def build_expected_items(self): - if Global.conf.no_user: + if self.configuration.no_user: logger.info("No user required") return - if Global.conf.users: + if self.configuration.users: logger.info("Adding required users as expected") - for userid in Global.conf.users: + for userid in self.configuration.users: self._add_user_to_scrape(userid, UNKNOWN_TITLE, True) return # WE DO NOT BUILD A LIST OF EXPECTED USERS, 
THE LIST IS WAY TOO BIG WITH LOTS @@ -93,41 +93,41 @@ def build_expected_items(self): # offset += limit # logger.info("{} user found".format(len(self.expected_items_keys))) - def get_one_item_content(self, item_key, item_data): + def get_one_item_content(self, item_key, _): # ARG002 userid = item_key - user_content = get_api_content(f"/users/{userid}") + user_content = self.utils.get_api_content(f"/users/{userid}") # other content is available in other endpoints, but not retrieved for now # (badges: not easy to process ; guides: does not seems to work properly) return user_content - def add_item_redirect(self, item_key, item_data, redirect_kind): + def add_item_redirect(self, _, item_data, redirect_kind): userid = item_data["userid"] usertitle = item_data["usertitle"] if usertitle == UNKNOWN_TITLE: logger.warning(f"Cannot add redirect for user {userid} in error") return path = self._build_user_path(userid, usertitle) - Global.add_redirect( + self.processor.add_redirect( path=path, target_path=f"home/{redirect_kind}?{urllib.parse.urlencode({'url':path})}", ) - def process_one_item(self, item_key, item_data, item_content): + def process_one_item(self, _, item_data, item_content): userid = item_data["userid"] usertitle = item_data["usertitle"] user_content = item_content user_rendered = self.user_template.render( user=user_content, - label=USER_LABELS[Global.conf.lang_code], - metadata=Global.metadata, + label=USER_LABELS[self.configuration.lang_code], + metadata=self.metadata, ) normal_path = self._build_user_path( userid=user_content["userid"], usertitle=user_content["username"], ) - Global.add_html_item( + self.processor.add_html_item( path=normal_path, title=user_content["username"], content=user_rendered, @@ -146,7 +146,7 @@ def process_one_item(self, item_key, item_data, item_content): "Adding user redirect for alternate user path from " f"{alternate_path} to {normal_path}" ) - Global.add_redirect( + self.processor.add_redirect( path=alternate_path, target_path=normal_path, ) diff --git a/src/ifixit2zim/shared.py b/src/ifixit2zim/shared.py new file mode 100644 index 0000000..2752b8f --- /dev/null +++ b/src/ifixit2zim/shared.py @@ -0,0 +1,34 @@ +import locale +import logging +import threading +from contextlib import contextmanager + +from zimscraperlib.logging import getLogger as lib_getLogger + +from ifixit2zim.constants import NAME + +logger = lib_getLogger( + NAME, + level=logging.INFO, + log_format="[%(threadName)s::%(asctime)s] %(levelname)s:%(message)s", +) + + +def set_debug(value): + level = logging.DEBUG if value else logging.INFO + logger.setLevel(level) + for handler in logger.handlers: + handler.setLevel(level) + + +LOCALE_LOCK = threading.Lock() + + +@contextmanager +def setlocale(name): + with LOCALE_LOCK: + saved = locale.setlocale(locale.LC_ALL) + try: + yield locale.setlocale(locale.LC_ALL, name) + finally: + locale.setlocale(locale.LC_ALL, saved) diff --git a/ifixit2zim/templates/base.html b/src/ifixit2zim/templates/base.html similarity index 100% rename from ifixit2zim/templates/base.html rename to src/ifixit2zim/templates/base.html diff --git a/ifixit2zim/templates/category.html b/src/ifixit2zim/templates/category.html similarity index 100% rename from ifixit2zim/templates/category.html rename to src/ifixit2zim/templates/category.html diff --git a/ifixit2zim/templates/external_content.html b/src/ifixit2zim/templates/external_content.html similarity index 100% rename from ifixit2zim/templates/external_content.html rename to 
src/ifixit2zim/templates/external_content.html diff --git a/ifixit2zim/templates/guide-comment.html b/src/ifixit2zim/templates/guide-comment.html similarity index 100% rename from ifixit2zim/templates/guide-comment.html rename to src/ifixit2zim/templates/guide-comment.html diff --git a/ifixit2zim/templates/guide-comments.html b/src/ifixit2zim/templates/guide-comments.html similarity index 100% rename from ifixit2zim/templates/guide-comments.html rename to src/ifixit2zim/templates/guide-comments.html diff --git a/ifixit2zim/templates/guide-step-lines-container.html b/src/ifixit2zim/templates/guide-step-lines-container.html similarity index 100% rename from ifixit2zim/templates/guide-step-lines-container.html rename to src/ifixit2zim/templates/guide-step-lines-container.html diff --git a/ifixit2zim/templates/guide.html b/src/ifixit2zim/templates/guide.html similarity index 100% rename from ifixit2zim/templates/guide.html rename to src/ifixit2zim/templates/guide.html diff --git a/ifixit2zim/templates/home.html b/src/ifixit2zim/templates/home.html similarity index 100% rename from ifixit2zim/templates/home.html rename to src/ifixit2zim/templates/home.html diff --git a/ifixit2zim/templates/info.html b/src/ifixit2zim/templates/info.html similarity index 100% rename from ifixit2zim/templates/info.html rename to src/ifixit2zim/templates/info.html diff --git a/ifixit2zim/templates/not_here.html b/src/ifixit2zim/templates/not_here.html similarity index 100% rename from ifixit2zim/templates/not_here.html rename to src/ifixit2zim/templates/not_here.html diff --git a/ifixit2zim/templates/user.html b/src/ifixit2zim/templates/user.html similarity index 100% rename from ifixit2zim/templates/user.html rename to src/ifixit2zim/templates/user.html diff --git a/src/ifixit2zim/utils.py b/src/ifixit2zim/utils.py new file mode 100644 index 0000000..d2e6871 --- /dev/null +++ b/src/ifixit2zim/utils.py @@ -0,0 +1,167 @@ +import io +import re +import urllib.parse +import zlib +from http import HTTPStatus + +import backoff +import bs4 +import requests +from kiwixstorage import KiwixStorage +from pif import get_public_ip +from zimscraperlib.download import _get_retry_adapter, stream_file + +from ifixit2zim.constants import API_PREFIX, Configuration +from ifixit2zim.shared import logger + + +def backoff_hdlr(details): + logger.warning( + "Backing off {wait:0.1f} seconds after {tries} tries " + "calling function {target} with args {args} and kwargs " + "{kwargs}".format(**details) + ) + + +class Utils: + def __init__(self, configuration: Configuration) -> None: + self.configuration = configuration + + def to_path(self, url: str) -> str: + """Path-part of an URL, without leading slash""" + return re.sub(r"^/", "", urllib.parse.urlparse(url).path) + + def get_url(self, path: str, **params) -> str: + """url-encoded in-source website url for a path""" + params_str = f"?{urllib.parse.urlencode(params)}" if params else "" + return ( + f"{self.configuration.main_url.geturl()}" + f"{urllib.parse.quote(path)}" + f"{params_str}" + ) + + def get_url_raw(self, path: str): + """in-source website url for a path, untainted""" + return f"{self.configuration.main_url.geturl()}{path}" + + def to_url(self, value: str) -> str: + """resolved potentially relative url from in-source link""" + return value if value.startswith("http") else self.get_url_raw(value) + + def to_rel(self, url: str) -> None | str: + """path from URL if on our main domain, else None""" + uri = urllib.parse.urlparse(url) + if uri.netloc != self.configuration.domain: + 
return None + return uri.path + + def no_leading_slash(self, text: str) -> str: + """text with leading slash removed if present""" + return re.sub(r"^/", "", text) + + def no_trailing_slash(self, text: str) -> str: + """text with trailing slash removed if present""" + return re.sub(r"/$", "", text) + + def only_path_of(self, url: str): + """normalized path part of an url""" + return self.normalize_ident(urllib.parse.urlparse(url).path) + + def fetch(self, path: str, **params) -> tuple[str, list[str]]: + """(source text, actual_paths) of a path from source website + + actual_paths is an ordered list of paths that were traversed to get to content. + Without redirection, it should be a single path, equal to the request. + The final, target path is always last""" + session = requests.Session() + session.mount("http", _get_retry_adapter(10)) # tied to http and https + resp = session.get(self.get_url(path, **params), params=params) + resp.raise_for_status() + + # we have params meaning we requested a page (?pg=xxx) + # assumption: this must be a category page (so on same domain) + # we thus need to use redirection target (which lost param) with params + if params and resp.history: + return self.fetch(self.only_path_of(resp.url), **params) + return resp.text, [ + self.no_leading_slash(self.only_path_of(r.url)) + for r in [*resp.history, resp] + ] + + def get_soup_of(self, text: str, *, unwrap: bool = False): + """an lxml soup of an HTML string""" + soup = bs4.BeautifulSoup(text, "lxml") + if unwrap: + for elem in ("body", "html"): + getattr(soup, elem).unwrap() + return soup + + def get_soup(self, path: str, **params) -> tuple[bs4.BeautifulSoup, list[str]]: + """an lxml soup of a path on source website""" + content, paths = self.fetch(path, **params) + return self.get_soup_of(content), paths + + def get_digest(self, url: str) -> str: + """simple digest of an url for mapping purpose""" + return str(zlib.adler32(url.encode("UTF-8"))) + + def normalize_ident(self, ident: str) -> str: + """URL-decoded category identifier""" + return urllib.parse.unquote(ident) + + def get_version_ident_for(self, url: str) -> str | None: + """~version~ of the URL data to use for comparisons. Built from headers""" + try: + resp = requests.head(url, timeout=10) + headers = resp.headers + except Exception as exc: + logger.warning(f"Unable to HEAD {url}") + logger.exception(exc) + try: + _, headers = stream_file( + url=url, + byte_stream=io.BytesIO(), + block_size=1, + only_first_block=True, + ) + except Exception as exc2: + logger.warning(f"Unable to query image at {url}") + logger.exception(exc2) + return + + for header in ("ETag", "Last-Modified", "Content-Length"): + if headers.get(header): + return headers.get(header) + + return "-1" + + def setup_s3_and_check_credentials(self, s3_url_with_credentials): + logger.info("testing S3 Optimization Cache credentials") + s3_storage = KiwixStorage(s3_url_with_credentials) + if not s3_storage.check_credentials( + list_buckets=True, bucket=True, write=True, read=True, failsafe=True + ): + logger.error("S3 cache connection error testing permissions.") + logger.error(f" Server: {s3_storage.url.netloc}") + logger.error(f" Bucket: {s3_storage.bucket_name}") + logger.error(f" Key ID: {s3_storage.params.get('keyid')}") + logger.error(f" Public IP: {get_public_ip()}") + raise ValueError("Unable to connect to Optimization Cache. 
Check its URL.") + return s3_storage + + @backoff.on_exception( + backoff.expo, + requests.exceptions.RequestException, + max_time=16, + on_backoff=backoff_hdlr, + ) + def get_api_content(self, path, **params): + full_path = self.get_url(API_PREFIX + path, **params) + logger.debug(f"Retrieving {full_path}") + response = requests.get(full_path, timeout=10) + json_data = ( + response.json() + if response and response.status_code == HTTPStatus.OK + else None + ) + return json_data diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..a95c71a --- /dev/null +++ b/tasks.py @@ -0,0 +1,109 @@ +# pyright: strict, reportUntypedFunctionDecorator=false +import os + +from invoke.context import Context +from invoke.tasks import task # pyright: ignore [reportUnknownVariableType] + +use_pty = not os.getenv("CI", "") + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test(ctx: Context, args: str = ""): + """run tests (without coverage)""" + ctx.run(f"pytest {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test_cov(ctx: Context, args: str = ""): + """run test vith coverage""" + ctx.run(f"coverage run -m pytest {args}", pty=use_pty) + + +@task(optional=["html"], help={"html": "flag to export html report"}) +def report_cov(ctx: Context, *, html: bool = False): + """report coverage""" + ctx.run("coverage combine", warn=True, pty=use_pty) + ctx.run("coverage report --show-missing", pty=use_pty) + if html: + ctx.run("coverage html", pty=use_pty) + + +@task( + optional=["args", "html"], + help={ + "args": "pytest additional arguments", + "html": "flag to export html report", + }, +) +def coverage(ctx: Context, args: str = "", *, html: bool = False): + """run tests and report coverage""" + test_cov(ctx, args=args) + report_cov(ctx, html=html) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def lint_black(ctx: Context, args: str = "."): + args = args or "." # needed for hatch script + ctx.run("black --version", pty=use_pty) + ctx.run(f"black --check --diff {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def lint_ruff(ctx: Context, args: str = "."): + args = args or "." # needed for hatch script + ctx.run("ruff --version", pty=use_pty) + ctx.run(f"ruff check {args}", pty=use_pty) + + +@task( + optional=["args"], + help={ + "args": "linting tools (black, ruff) additional arguments, typically a path", + }, +) +def lintall(ctx: Context, args: str = "."): + """Check linting""" + args = args or "." # needed for hatch script + lint_black(ctx, args) + lint_ruff(ctx, args) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def check_pyright(ctx: Context, args: str = ""): + """check static types with pyright""" + ctx.run("pyright --version") + ctx.run(f"pyright {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def checkall(ctx: Context, args: str = ""): + """check static types""" + check_pyright(ctx, args) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def fix_black(ctx: Context, args: str = "."): + """fix black formatting""" + args = args or "." # needed for hatch script + ctx.run(f"black {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def fix_ruff(ctx: Context, args: str = "."): + """fix all ruff rules""" + args = args or "." 
# needed for hatch script + ctx.run(f"ruff check --fix {args}", pty=use_pty) + + +@task( + optional=["args"], + help={ + "args": "linting tools (black, ruff) additional arguments, typically a path", + }, +) +def fixall(ctx: Context, args: str = "."): + """Fix everything automatically""" + args = args or "." # needed for hatch script + fix_black(ctx, args) + fix_ruff(ctx, args) + lintall(ctx, args) diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..0ede956 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,5 @@ +from ifixit2zim.__about__ import __version__ + + +def test_version(): + assert __version__
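The heart of this refactor is visible across the scraper diffs above: the former `Global` singleton (and module-level helpers such as `get_api_content`) is replaced by a `Context` object that `scraper.py` builds once and injects into every scraper, with `ScraperGeneric` exposing the shared members (`configuration`, `utils`, `metadata`, `env`, `lock`, `creator`, `processor`) as read-only properties. Below is a minimal sketch of that pattern, using a pared-down stand-in for the real `Context` (the actual class lives in `src/ifixit2zim/context.py`, which is not part of this excerpt, so its exact fields and typing are assumptions):

    import threading
    from dataclasses import dataclass
    from typing import Any


    @dataclass
    class Context:
        # pared-down stand-in: the real Context also carries
        # utils, metadata, env and processor
        lock: threading.Lock
        configuration: Any
        creator: Any


    class ScraperGeneric:
        def __init__(self, context: Context):
            self.context = context

        # read-only delegation keeps call sites short (self.configuration...)
        # and removes any dependency on a mutable module-level Global
        @property
        def configuration(self) -> Any:
            return self.context.configuration

        @property
        def lock(self) -> threading.Lock:
            return self.context.lock


    # every scraper receives the same instance, so state stays shared
    # without a global module attribute
    ctx = Context(lock=threading.Lock(), configuration={"lang_code": "en"}, creator=None)
    scraper = ScraperGeneric(ctx)
    assert scraper.configuration["lang_code"] == "en"

Because ScraperHomepage, ScraperGuide, ScraperCategory, ScraperInfo and ScraperUser all inherit this delegation, the patch can mechanically rewrite every `Global.conf.*` access to `self.configuration.*` and every bare `get_api_content(...)` call to `self.utils.get_api_content(...)` without changing behaviour.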