Skip to content

Commit

Permalink
Merge pull request #302 from kiwix/test-mirrors
Browse files Browse the repository at this point in the history
Add mirrors check to test-suite
  • Loading branch information
rgaudin authored Nov 7, 2024
2 parents 79e1175 + 81d2d04 commit 95ec3b3
Show file tree
Hide file tree
Showing 8 changed files with 413 additions and 41 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/library_check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
run: pip install -r test-suite/requirements.txt
- name: Test library.kiwix.org
working-directory: test-suite
run: pytest -vvv
run: pytest -vvv test_library.py

dev-library:
runs-on: ubuntu-22.04
Expand All @@ -39,4 +39,4 @@ jobs:
SCHEMES: http,https
LIBRARY_HOST: dev.library.kiwix.org
TIMEOUT: 30
run: pytest -vvv -m "not varnish"
run: pytest -vvv -m "not varnish" test_library.py
24 changes: 24 additions & 0 deletions .github/workflows/mirrors_check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Daily (and on push to main) check that download mirrors serve expected files.
name: Mirrors Check

on:
  schedule:
    # GitHub evaluates cron expressions in UTC: every day at 08:00
    - cron: "0 8 * * *"
  push:
    branches:
      - main

jobs:

  mirrors:
    runs-on: ubuntu-24.04
    steps:
      # v4/v5 run on a supported Node runtime (v3/v4 relied on deprecated Node 16)
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          # quoted so YAML never parses the version as a float (e.g. 3.10 -> 3.1)
          python-version: "3.12"
          architecture: x64
      - name: Install dependencies
        run: pip install -r test-suite/requirements.txt
      - name: Test mirrors
        working-directory: test-suite
        run: pytest -vvv test_mirrors.py
156 changes: 120 additions & 36 deletions test-suite/README.md

Large diffs are not rendered by default.

95 changes: 94 additions & 1 deletion test-suite/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,47 @@
import re
import urllib.parse

import pytest
import requests
from utils import get_url
from utils import Mirror, get_current_mirrors, get_url

# MB (presumably MirrorBrain, see the script referenced below) only provides
# the full list of mirrors through this page.
MIRRORS_LIST_URL: str = "https://download.kiwix.org/mirrors.html"
# list of mirrors (hostname) not to use in tests
EXCLUDED_MIRRORS: list[str] = [
    # saimei is itself a load-balancer. all have same config. testing only laotzu
    "saimei.ftp.acc.umu.se",
]
# this is using the permalink pattern
# from the permalink redirects (no warehouse path, no period in filename)
# using wikipedia_he_* as this is the only pattern mirrored by all mirrors
# good enough for now
PERMANENT_ZIM_URL: str = "https://download.kiwix.org/zim/wikipedia_he_all_maxi.zim"
# all non-zim-only mirrors mirror all other files
PERMANENT_APK_URL: str = "https://download.kiwix.org/release/kiwix-android/kiwix.apk"
# we have no other way to know which mirrors are supposed to have APKs
# taken from https://github.com/kiwix/container-images
# /blob/main/mirrorbrain/bin/update_mirrorbrain_db.sh
EXPECTED_APK_MIRRORS: list[str] = [
    "mirrors.dotsrc.org",
    "mirror.download.kiwix.org",
    "ftp.nluug.nl",
    "ftp.fau.de",
    "md.mirrors.hacktegic.com",
    "mirror-sites-fr.mblibrary.info",
    "mirror-sites-ca.mblibrary.info",
    "mirror-sites-in.mblibrary.info",
]
# NOTE: this runs at import time (i.e. during pytest collection) and calls
# get_current_mirrors with the list URL; the result is needed that early
# because it feeds the parametrize decorators in test_mirrors.py.
ZIM_MIRRORS: list[Mirror] = get_current_mirrors(MIRRORS_LIST_URL, EXCLUDED_MIRRORS)
# IDs are used for pytest output
ZIM_MIRRORS_IDS: list[str] = [mirror.hostname for mirror in ZIM_MIRRORS]
# we could have discovered mirrors via testing an actual APK online which would
# have included any extra mirror serving it but it causes other issues (chicken/egg)
# and expecting maintenance of tests on mirror update is OK
APK_MIRRORS: list[Mirror] = [
    mirror for mirror in ZIM_MIRRORS if mirror.hostname in EXPECTED_APK_MIRRORS
]
# IDs (hostnames) of the APK mirrors, in the same order as APK_MIRRORS
APK_MIRRORS_IDS: list[str] = [mirror.hostname for mirror in APK_MIRRORS]


@pytest.fixture(scope="session")
Expand All @@ -13,3 +52,57 @@ def illus_endpoint():
match = re.search(r"/catalog/v2/illustration/([^/]+)/\?size=48", line)
if match:
yield match.group()


@pytest.fixture(scope="session")
def mirrors_list_url():
    """Session fixture exposing the URL of the page that lists the mirrors."""
    return MIRRORS_LIST_URL


@pytest.fixture(scope="session")
def excluded_mirrors():
    """Session fixture exposing the hostnames of mirrors left out of the tests."""
    return EXCLUDED_MIRRORS


@pytest.fixture(scope="session")
def permanent_zim_url():
    """Session fixture exposing the permalink URL of the reference ZIM file."""
    return PERMANENT_ZIM_URL


@pytest.fixture(scope="session")
def current_zim_url(permanent_zim_url):
    """Session fixture resolving the ZIM permalink to the URL it points at.

    Sends one HEAD request without following redirects and provides the
    value of the ``Location`` header found on the response.
    """
    response = requests.head(permanent_zim_url, allow_redirects=False, timeout=5)
    response.raise_for_status()
    location = response.headers["Location"]
    return location


@pytest.fixture(scope="session")
def current_zim_path(current_zim_url):
    """Session fixture giving the path part of the current ZIM URL.

    Leading slashes are stripped from the path.
    """
    parsed = urllib.parse.urlparse(current_zim_url)
    return parsed.path.lstrip("/")


@pytest.fixture(scope="session")
def permanent_apk_url():
    """Session fixture exposing the permalink URL of the reference APK file."""
    return PERMANENT_APK_URL


@pytest.fixture(scope="session")
def current_apk_url(permanent_apk_url):
    """Session fixture resolving the APK permalink to the URL it points at.

    Sends one HEAD request without following redirects and provides the
    value of the ``Location`` header found on the response.
    """
    response = requests.head(permanent_apk_url, allow_redirects=False, timeout=5)
    response.raise_for_status()
    location = response.headers["Location"]
    return location


@pytest.fixture(scope="session")
def current_apk_path(current_apk_url):
    """Session fixture giving the path part of the current APK URL.

    Leading slashes are stripped from the path.
    """
    parsed = urllib.parse.urlparse(current_apk_url)
    return parsed.path.lstrip("/")


@pytest.fixture(scope="session")
def current_zim_mirrors():
    """Session fixture exposing the mirrors collected in ``ZIM_MIRRORS``."""
    return ZIM_MIRRORS


@pytest.fixture(scope="session")
def current_apk_mirrors():
    """Session fixture exposing the mirrors collected in ``APK_MIRRORS``."""
    return APK_MIRRORS
1 change: 1 addition & 0 deletions test-suite/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
requests==2.31.0
pytest==7.4.0
beautifulsoup4==4.12.3
File renamed without changes.
124 changes: 124 additions & 0 deletions test-suite/test_mirrors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from http import HTTPStatus
from urllib.parse import urljoin

import pytest
import requests
from conftest import (
APK_MIRRORS,
APK_MIRRORS_IDS,
EXPECTED_APK_MIRRORS,
ZIM_MIRRORS,
ZIM_MIRRORS_IDS,
)
from utils import TIMEOUT, Mirror


def test_mirrors_list_reachable(mirrors_list_url):
    """The mirrors list URL answers 302 directly and 200 once redirects are followed."""
    # requests has no default timeout: pass one explicitly so an unresponsive
    # server fails the test instead of hanging the whole run
    direct = requests.head(mirrors_list_url, timeout=TIMEOUT)
    assert direct.status_code == HTTPStatus.FOUND

    followed = requests.head(mirrors_list_url, allow_redirects=True, timeout=TIMEOUT)
    assert followed.status_code == HTTPStatus.OK


def test_zim_exists(permanent_zim_url, current_zim_url):
    """Both the ZIM permalink and its current target answer 200 after redirects."""
    for url in (permanent_zim_url, current_zim_url):
        # explicit timeout: requests would otherwise wait indefinitely
        resp = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
        assert resp.status_code == HTTPStatus.OK


def test_zim_permalink(permanent_zim_url, current_zim_url):
    """Following redirects from the permalink and from the current URL ends at the same URL."""
    via_permalink = requests.head(
        permanent_zim_url, allow_redirects=True, timeout=TIMEOUT
    )
    via_current = requests.head(
        current_zim_url, allow_redirects=True, timeout=TIMEOUT
    )
    assert via_permalink.url == via_current.url


def test_zim_mirrors_list(current_zim_mirrors):
    """The mirrors list must contain a minimum number of ZIM mirrors."""
    # arbitrary threshold; should fail if we don't reach it (currently it's 14)
    minimum_expected = 12
    assert len(current_zim_mirrors) >= minimum_expected


@pytest.mark.parametrize("mirror", ZIM_MIRRORS, ids=ZIM_MIRRORS_IDS)
def test_mirror_has_zim_file(mirror: Mirror, current_zim_path: str):
    """Each ZIM mirror answers 200 (without redirect) for the current ZIM path."""
    target = urljoin(mirror.base_url, current_zim_path)
    response = requests.head(target, timeout=TIMEOUT, allow_redirects=False)
    assert response.status_code == HTTPStatus.OK


@pytest.mark.parametrize("mirror", ZIM_MIRRORS, ids=ZIM_MIRRORS_IDS)
def test_mirror_zim_has_contenttype(mirror: Mirror, current_zim_path: str):
    """Each ZIM mirror sends a non-empty content-type header for the ZIM file."""
    target = urljoin(mirror.base_url, current_zim_path)
    headers = requests.head(target, timeout=TIMEOUT, allow_redirects=False).headers
    assert headers.get("content-type")


@pytest.mark.parametrize("mirror", ZIM_MIRRORS, ids=ZIM_MIRRORS_IDS)
def test_mirror_zim_contenttype(mirror: Mirror, current_zim_path: str):
    """Each ZIM mirror serves the ZIM file as ``application/octet-stream``.

    A mirror sending no content-type at all is reported as an expected failure
    (xfail) rather than a failure of this test.
    """
    url = urljoin(mirror.base_url, current_zim_path)
    ctype = requests.head(url, timeout=TIMEOUT, allow_redirects=False).headers.get(
        "content-type"
    )
    if ctype is None:
        pytest.xfail("no content-type")
    # the URL is reported through the assertion message instead of a stray print
    assert ctype == "application/octet-stream", f"unexpected content-type for {url}"


def test_apk_exists(permanent_apk_url, current_apk_url):
    """Both the APK permalink and its current target answer 200 after redirects."""
    for url in (permanent_apk_url, current_apk_url):
        # explicit timeout: requests would otherwise wait indefinitely
        resp = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
        assert resp.status_code == HTTPStatus.OK


def test_apk_permalink(permanent_apk_url, current_apk_url):
    """Following redirects from the permalink and from the current URL ends at the same URL."""
    via_permalink = requests.head(
        permanent_apk_url, allow_redirects=True, timeout=TIMEOUT
    )
    via_current = requests.head(
        current_apk_url, allow_redirects=True, timeout=TIMEOUT
    )
    assert via_permalink.url == via_current.url


def test_apk_mirrors_list(current_apk_mirrors):
    """There must be at least as many APK mirrors as expected APK hostnames."""
    # at the moment this is described as a no-op, kept to prevent further issues
    expected_count = len(EXPECTED_APK_MIRRORS)
    assert len(current_apk_mirrors) >= expected_count


@pytest.mark.parametrize("mirror", APK_MIRRORS, ids=APK_MIRRORS_IDS)
def test_mirror_has_apk_file(mirror: Mirror, current_apk_path: str):
    """Each APK mirror answers 200 (without redirect) for the current APK path."""
    target = urljoin(mirror.base_url, current_apk_path)
    response = requests.head(target, timeout=TIMEOUT, allow_redirects=False)
    assert response.status_code == HTTPStatus.OK


@pytest.mark.parametrize("mirror", APK_MIRRORS, ids=APK_MIRRORS_IDS)
def test_mirror_apk_has_contenttype(mirror: Mirror, current_apk_path: str):
    """Each APK mirror sends a non-empty content-type header for the APK file."""
    target = urljoin(mirror.base_url, current_apk_path)
    headers = requests.head(target, timeout=TIMEOUT, allow_redirects=False).headers
    assert headers.get("content-type")


@pytest.mark.parametrize("mirror", APK_MIRRORS, ids=APK_MIRRORS_IDS)
def test_mirror_apk_contenttype(mirror: Mirror, current_apk_path: str):
    """Each APK mirror serves the APK with the Android package content-type.

    A mirror sending no content-type at all is reported as an expected failure
    (xfail) rather than a failure of this test.
    """
    target = urljoin(mirror.base_url, current_apk_path)
    response = requests.head(target, timeout=TIMEOUT, allow_redirects=False)
    ctype = response.headers.get("content-type")
    if ctype is None:
        pytest.xfail("no content-type")
    assert ctype == "application/vnd.android.package-archive"
50 changes: 48 additions & 2 deletions test-suite/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# pyright: reportImplicitStringConcatenation=false
import os
from typing import Any, NamedTuple
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup, NavigableString
from bs4.element import Tag

TIMEOUT = int(os.getenv("TIMEOUT") or "5")
SCHEMES = os.getenv("SCHEMES", "https").split(",")
Expand All @@ -28,7 +32,6 @@
}



def get_url(
path="/",
scheme=DEFAULT_SCHEME,
Expand All @@ -42,11 +45,54 @@ def get_response_headers(path, method="HEAD", scheme=DEFAULT_SCHEME):
method=method,
url=get_url(path=path, scheme=scheme),
headers={"Accept-Encoding": "gzip, deflate, br"},
timeout=TIMEOUT
timeout=TIMEOUT,
).headers


def is_cached(path, method="GET", scheme=DEFAULT_SCHEME):
    """Tell whether the second of two identical requests reports a cache hit.

    The resource is requested twice and the ``X-Varnish`` header of the last
    response is inspected: it is considered cached when that header holds at
    least two space-separated values.

    Returns False (instead of raising AttributeError) when the response
    carries no ``X-Varnish`` header at all.
    """
    ret = None
    # two requests on purpose; only the headers of the last one are inspected
    for _ in range(2):
        ret = get_response_headers(path, method=method, scheme=scheme).get("X-Varnish")
    if ret is None:
        # header missing: nothing indicates the response came from the cache
        return False
    return len(ret.split(" ")) >= 2


class Mirror(NamedTuple):
    """A single download mirror, as built by get_current_mirrors."""

    # network location (netloc) extracted from base_url
    hostname: str
    # target (href) of the mirror's "HTTP" link in the mirrors list
    base_url: str
    # lowercased alt text of the image in the mirror's row
    # (presumably a country code, as the name suggests; confirm against the page)
    country_code: str


def get_current_mirrors(
    mirrors_list_url: str, excluded_mirrors: list[str]
) -> list[Mirror]:
    """Current mirrors from the mirrors url.

    Downloads the mirrors list page and builds one Mirror per table row that
    exposes an "HTTP" link, in page order.

    Args:
        mirrors_list_url: URL of the HTML page listing the mirrors.
        excluded_mirrors: hostnames to leave out of the result.

    Returns:
        The mirrors found on the page, minus the excluded ones.

    Raises:
        requests.HTTPError: the page could not be retrieved successfully.
        ValueError: the page holds no usable ``<tbody>`` element.
    """

    def is_country_row(tag: Tag) -> bool:
        """Filters out table rows that do not contain mirror data."""
        # find() is the supported spelling of the deprecated findChild() alias
        return tag.name == "tr" and tag.find("td", class_="newregion") is None

    resp = requests.get(mirrors_list_url, timeout=TIMEOUT, allow_redirects=True)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, features="html.parser")
    body = soup.find("tbody")

    if body is None or isinstance(body, NavigableString | int):
        raise ValueError(f"unable to parse mirrors information from {mirrors_list_url}")

    mirrors: list[Mirror] = []

    for row in body.find_all(is_country_row):
        link = row.find("a", string="HTTP")
        if link is None or not link.get("href"):
            # not a mirror row we can test over HTTP (no "HTTP" link): skip it
            # rather than failing on a None subscript
            continue
        base_url = link["href"]
        hostname: Any = urlsplit(
            base_url
        ).netloc  # pyright: ignore [reportUnknownMemberType]
        if hostname in excluded_mirrors:
            continue
        flag = row.find("img")
        # a row without image (or without alt text) gets an empty country code
        country_code = flag.get("alt", "").lower() if flag is not None else ""
        mirrors.append(
            Mirror(
                hostname=hostname,
                base_url=base_url,
                country_code=country_code,
            )
        )
    return mirrors

0 comments on commit 95ec3b3

Please sign in to comment.