From f0aeb5c3b4525616b789091d69441930552eaa29 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 24 Oct 2024 12:56:47 +0000 Subject: [PATCH] Return complete url rewrite information so that 'users' can decide what they want to do --- src/zimscraperlib/rewriting/css.py | 8 +- src/zimscraperlib/rewriting/html.py | 11 +- src/zimscraperlib/rewriting/js.py | 2 +- src/zimscraperlib/rewriting/url_rewriting.py | 53 ++- tests/rewriting/conftest.py | 9 +- tests/rewriting/test_url_rewriting.py | 464 +++++++++++++++---- 6 files changed, 434 insertions(+), 113 deletions(-) diff --git a/src/zimscraperlib/rewriting/css.py b/src/zimscraperlib/rewriting/css.py index d992ad9b..c04cbcf0 100644 --- a/src/zimscraperlib/rewriting/css.py +++ b/src/zimscraperlib/rewriting/css.py @@ -51,7 +51,7 @@ def __simple_transform( [ "url(", m_object["quote"], - url_rewriter(m_object["url"], base_href), + url_rewriter(m_object["url"], base_href).rewriten_url, m_object["quote"], ")", ] @@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node): new_url = self.url_rewriter( url_node.value, # pyright: ignore self.base_href, - ) + ).rewriten_url url_node.value = str(new_url) # pyright: ignore url_node.representation = ( # pyright: ignore f'"{serialize_url(str(new_url))}"' @@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node): elif isinstance(node, ast.Declaration): self._process_list(node.value) # pyright: ignore elif isinstance(node, ast.URLToken): - new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore + new_url = self.url_rewriter( + node.value, self.base_href + ).rewriten_url # pyright: ignore node.value = new_url node.representation = f"url({serialize_url(new_url)})" diff --git a/src/zimscraperlib/rewriting/html.py b/src/zimscraperlib/rewriting/html.py index 224e9a51..cd78aab4 100644 --- a/src/zimscraperlib/rewriting/html.py +++ b/src/zimscraperlib/rewriting/html.py @@ -603,7 +603,9 @@ def rewrite_href_src_attributes( notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href)) return ( attr_name, - url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"), + url_rewriter( + attr_value, base_href=base_href, rewrite_all_url=tag != "a" + ).rewriten_url, ) @@ -618,10 +620,10 @@ def rewrite_srcset_attribute( if attr_name != "srcset" or not attr_value: return value_list = attr_value.split(",") - new_value_list = [] + new_value_list: list[str] = [] for value in value_list: url, *other = value.strip().split(" ", maxsplit=1) - new_url = url_rewriter(url, base_href=base_href) + new_url = url_rewriter(url, base_href=base_href).rewriten_url new_value = " ".join([new_url, *other]) new_value_list.append(new_value) return (attr_name, ", ".join(new_value_list)) @@ -711,5 +713,6 @@ def rewrite_meta_http_equiv_redirect( return return ( attr_name, - f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}", + f"{match['interval']};" + f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}", ) diff --git a/src/zimscraperlib/rewriting/js.py b/src/zimscraperlib/rewriting/js.py index 4fd93d68..237243fa 100644 --- a/src/zimscraperlib/rewriting/js.py +++ b/src/zimscraperlib/rewriting/js.py @@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str: This takes into account that the result must be a relative URL, i.e. it cannot be 'vendor.module.js' but must be './vendor.module.js'. """ - url = self.url_rewriter(url, base_href=self.base_href) + url = self.url_rewriter(url, base_href=self.base_href).rewriten_url if not ( url.startswith("/") or url.startswith("./") or url.startswith("../") ): diff --git a/src/zimscraperlib/rewriting/url_rewriting.py b/src/zimscraperlib/rewriting/url_rewriting.py index 52d2abde..2b582bd9 100644 --- a/src/zimscraperlib/rewriting/url_rewriting.py +++ b/src/zimscraperlib/rewriting/url_rewriting.py @@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None: raise ValueError(f"Unexpected password in value: {value} {parts.password}") +class RewriteResult(NamedTuple): + absolute_url: str + rewriten_url: str + zim_path: ZimPath | None + + class ArticleUrlRewriter: """ Rewrite urls in article. @@ -176,16 +182,11 @@ def __init__( missing_zim_paths: list of ZIM paths which are known to already be missing from the existing_zim_paths ; usefull only in complement with this variable ; new missing entries will be added as URLs are normalized in this function - - Results: - items_to_download: populated with the list of rewritten URLs, so that one - might use it to download items after rewriting the document """ self.article_path = article_path or ArticleUrlRewriter.normalize(article_url) self.article_url = article_url self.existing_zim_paths = existing_zim_paths self.missing_zim_paths = missing_zim_paths - self.items_to_download: dict[ZimPath, HttpUrl] = {} def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath: """Utility to transform an item URL into a ZimPath""" @@ -201,7 +202,7 @@ def __call__( base_href: str | None, *, rewrite_all_url: bool = True, - ) -> str: + ) -> RewriteResult: """Rewrite a url contained in a article. The url is "fully" rewrited to point to a normalized entry path @@ -210,17 +211,25 @@ def __call__( try: item_url = item_url.strip() + item_absolute_url = urljoin( + urljoin(self.article_url.value, base_href), item_url + ) + # Make case of standalone fragments more straightforward if item_url.startswith("#"): - return item_url + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_url, + zim_path=None, + ) item_scheme = urlsplit(item_url).scheme if item_scheme and item_scheme not in ("http", "https"): - return item_url - - item_absolute_url = urljoin( - urljoin(self.article_url.value, base_href), item_url - ) + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_url, + zim_path=None, + ) item_fragment = urlsplit(item_absolute_url).fragment @@ -229,9 +238,11 @@ def __call__( if rewrite_all_url or ( self.existing_zim_paths and item_path in self.existing_zim_paths ): - if item_path not in self.items_to_download: - self.items_to_download[item_path] = HttpUrl(item_absolute_url) - return self.get_document_uri(item_path, item_fragment) + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=self.get_document_uri(item_path, item_fragment), + zim_path=item_path, + ) else: if ( self.missing_zim_paths is not None @@ -242,7 +253,11 @@ def __call__( # with duplicate messages self.missing_zim_paths.add(item_path) # The url doesn't point to a known entry - return item_absolute_url + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_absolute_url, + zim_path=item_path, + ) except Exception as exc: # pragma: no cover item_scheme = ( @@ -275,7 +290,11 @@ def __call__( f"rewrite_all_url: {rewrite_all_url}", exc_info=exc, ) - return item_url + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_url, + zim_path=None, + ) def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str: """Given an ZIM item path and its fragment, get the URI to use in document diff --git a/tests/rewriting/conftest.py b/tests/rewriting/conftest.py index 62fb4410..40ccbc25 100644 --- a/tests/rewriting/conftest.py +++ b/tests/rewriting/conftest.py @@ -7,6 +7,7 @@ from zimscraperlib.rewriting.url_rewriting import ( ArticleUrlRewriter, HttpUrl, + RewriteResult, ZimPath, ) @@ -24,8 +25,12 @@ def __call__( base_href: str | None, # noqa: ARG002 *, rewrite_all_url: bool = True, # noqa: ARG002 - ) -> str: - return item_url + self.suffix + ) -> RewriteResult: + return RewriteResult( + absolute_url=item_url + self.suffix, + rewriten_url=item_url + self.suffix, + zim_path=None, + ) def get_item_path( self, item_url: str, base_href: str | None # noqa: ARG002 diff --git a/tests/rewriting/test_url_rewriting.py b/tests/rewriting/test_url_rewriting.py index c14e9f84..4d7213ee 100644 --- a/tests/rewriting/test_url_rewriting.py +++ b/tests/rewriting/test_url_rewriting.py @@ -3,6 +3,7 @@ from zimscraperlib.rewriting.url_rewriting import ( ArticleUrlRewriter, HttpUrl, + RewriteResult, ZimPath, ) @@ -147,237 +148,372 @@ def test_missing_zim_paths( assert missing_zim_paths == expected_missing_zim_paths @pytest.mark.parametrize( - "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "article_url, original_content_url, expected_rewrite_result, know_paths, " "rewrite_all_url", [ ( "https://kiwix.org/a/article/document.html", "foo.html", - "foo.html", + RewriteResult( + "https://kiwix.org/a/article/foo.html", + "foo.html", + ZimPath("kiwix.org/a/article/foo.html"), + ), ["kiwix.org/a/article/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "foo.html#anchor1", - "foo.html#anchor1", + RewriteResult( + "https://kiwix.org/a/article/foo.html#anchor1", + "foo.html#anchor1", + ZimPath("kiwix.org/a/article/foo.html"), + ), ["kiwix.org/a/article/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "foo.html?foo=bar", - "foo.html%3Ffoo%3Dbar", + RewriteResult( + "https://kiwix.org/a/article/foo.html?foo=bar", + "foo.html%3Ffoo%3Dbar", + ZimPath("kiwix.org/a/article/foo.html?foo=bar"), + ), ["kiwix.org/a/article/foo.html?foo=bar"], False, ), ( "https://kiwix.org/a/article/document.html", "foo.html?foo=b%24ar", - "foo.html%3Ffoo%3Db%24ar", + RewriteResult( + "https://kiwix.org/a/article/foo.html?foo=b%24ar", + "foo.html%3Ffoo%3Db%24ar", + ZimPath("kiwix.org/a/article/foo.html?foo=b$ar"), + ), ["kiwix.org/a/article/foo.html?foo=b$ar"], False, ), ( "https://kiwix.org/a/article/document.html", "foo.html?foo=b%3Far", # a query string with an encoded ? char in value - "foo.html%3Ffoo%3Db%3Far", + RewriteResult( + "https://kiwix.org/a/article/foo.html?foo=b%3Far", + "foo.html%3Ffoo%3Db%3Far", + ZimPath("kiwix.org/a/article/foo.html?foo=b?ar"), + ), ["kiwix.org/a/article/foo.html?foo=b?ar"], False, ), ( "https://kiwix.org/a/article/document.html", "fo%o.html", - "fo%25o.html", + RewriteResult( + "https://kiwix.org/a/article/fo%o.html", + "fo%25o.html", + ZimPath("kiwix.org/a/article/fo%o.html"), + ), ["kiwix.org/a/article/fo%o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "foé.html", # URL not matching RFC 3986 (found in invalid HTML doc) - "fo%C3%A9.html", # character is encoded so that URL match RFC 3986 - ["kiwix.org/a/article/foé.html"], # but ZIM path is non-encoded + RewriteResult( + "https://kiwix.org/a/article/foé.html", + "fo%C3%A9.html", # character is encoded so that URL match RFC 3986 + ZimPath( + "kiwix.org/a/article/foé.html" + ), # but ZIM path is non-encoded + ), + ["kiwix.org/a/article/foé.html"], False, ), ( "https://kiwix.org/a/article/document.html", "./foo.html", - "foo.html", + RewriteResult( + "https://kiwix.org/a/article/foo.html", + "foo.html", + ZimPath("kiwix.org/a/article/foo.html"), + ), ["kiwix.org/a/article/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "../foo.html", - "https://kiwix.org/a/foo.html", # Full URL since not in known URLs + RewriteResult( + "https://kiwix.org/a/foo.html", + "https://kiwix.org/a/foo.html", # Full URL since not in known URLs + ZimPath("kiwix.org/a/foo.html"), + ), ["kiwix.org/a/article/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "../foo.html", - "../foo.html", # all URLs rewrite activated + RewriteResult( + "https://kiwix.org/a/foo.html", + "../foo.html", # all URLs rewrite activated + ZimPath("kiwix.org/a/foo.html"), + ), ["kiwix.org/a/article/foo.html"], True, ), ( "https://kiwix.org/a/article/document.html", "../foo.html", - "../foo.html", + RewriteResult( + "https://kiwix.org/a/foo.html", + "../foo.html", + ZimPath("kiwix.org/a/foo.html"), + ), ["kiwix.org/a/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "../bar/foo.html", - "https://kiwix.org/a/bar/foo.html", # Full URL since not in known URLs + RewriteResult( + "https://kiwix.org/a/bar/foo.html", + "https://kiwix.org/a/bar/foo.html", # Full URL since not in known + # URLs + ZimPath("kiwix.org/a/bar/foo.html"), + ), ["kiwix.org/a/article/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "../bar/foo.html", - "../bar/foo.html", # all URLs rewrite activated + RewriteResult( + "https://kiwix.org/a/bar/foo.html", + "../bar/foo.html", # all URLs rewrite activated + ZimPath("kiwix.org/a/bar/foo.html"), + ), ["kiwix.org/a/article/foo.html"], True, ), ( "https://kiwix.org/a/article/document.html", "../bar/foo.html", - "../bar/foo.html", + RewriteResult( + "https://kiwix.org/a/bar/foo.html", + "../bar/foo.html", + ZimPath("kiwix.org/a/bar/foo.html"), + ), ["kiwix.org/a/bar/foo.html"], False, ), ( # we cannot go upper than host, so '../' in excess are removed "https://kiwix.org/a/article/document.html", "../../../../../foo.html", - "../../foo.html", + RewriteResult( + "https://kiwix.org/foo.html", + "../../foo.html", + ZimPath("kiwix.org/foo.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "foo?param=value", - "foo%3Fparam%3Dvalue", + RewriteResult( + "https://kiwix.org/a/article/foo?param=value", + "foo%3Fparam%3Dvalue", + ZimPath("kiwix.org/a/article/foo?param=value"), + ), ["kiwix.org/a/article/foo?param=value"], False, ), ( "https://kiwix.org/a/article/document.html", "foo?param=value%2F", - "foo%3Fparam%3Dvalue/", + RewriteResult( + "https://kiwix.org/a/article/foo?param=value%2F", + "foo%3Fparam%3Dvalue/", + ZimPath("kiwix.org/a/article/foo?param=value/"), + ), ["kiwix.org/a/article/foo?param=value/"], False, ), ( "https://kiwix.org/a/article/document.html", "foo?param=value%2Fend", - "foo%3Fparam%3Dvalue/end", + RewriteResult( + "https://kiwix.org/a/article/foo?param=value%2Fend", + "foo%3Fparam%3Dvalue/end", + ZimPath("kiwix.org/a/article/foo?param=value/end"), + ), ["kiwix.org/a/article/foo?param=value/end"], False, ), ( "https://kiwix.org/a/article/document.html", "foo/", - "foo/", + RewriteResult( + "https://kiwix.org/a/article/foo/", + "foo/", + ZimPath("kiwix.org/a/article/foo/"), + ), ["kiwix.org/a/article/foo/"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo o.html", - "../../fo%20o.html", + RewriteResult( + "https://kiwix.org/fo o.html", + "../../fo%20o.html", + ZimPath("kiwix.org/fo o.html"), + ), ["kiwix.org/fo o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo+o.html", - "../../fo%2Bo.html", + RewriteResult( + "https://kiwix.org/fo+o.html", + "../../fo%2Bo.html", + ZimPath("kiwix.org/fo+o.html"), + ), ["kiwix.org/fo+o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo%2Bo.html", - "../../fo%2Bo.html", + RewriteResult( + "https://kiwix.org/fo%2Bo.html", + "../../fo%2Bo.html", + ZimPath("kiwix.org/fo+o.html"), + ), ["kiwix.org/fo+o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/foo.html?param=val+ue", - "../../foo.html%3Fparam%3Dval%20ue", + RewriteResult( + "https://kiwix.org/foo.html?param=val+ue", + "../../foo.html%3Fparam%3Dval%20ue", + ZimPath("kiwix.org/foo.html?param=val ue"), + ), ["kiwix.org/foo.html?param=val ue"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo~o.html", - "../../fo~o.html", + RewriteResult( + "https://kiwix.org/fo~o.html", + "../../fo~o.html", + ZimPath("kiwix.org/fo~o.html"), + ), ["kiwix.org/fo~o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo-o.html", - "../../fo-o.html", + RewriteResult( + "https://kiwix.org/fo-o.html", + "../../fo-o.html", + ZimPath("kiwix.org/fo-o.html"), + ), ["kiwix.org/fo-o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo_o.html", - "../../fo_o.html", + RewriteResult( + "https://kiwix.org/fo_o.html", + "../../fo_o.html", + ZimPath("kiwix.org/fo_o.html"), + ), ["kiwix.org/fo_o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo%7Eo.html", # must not be encoded / must be decoded (RFC 3986 #2.3) - "../../fo~o.html", + RewriteResult( + "https://kiwix.org/fo%7Eo.html", + "../../fo~o.html", + ZimPath("kiwix.org/fo~o.html"), + ), ["kiwix.org/fo~o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo%2Do.html", # must not be encoded / must be decoded (RFC 3986 #2.3) - "../../fo-o.html", + RewriteResult( + "https://kiwix.org/fo%2Do.html", + "../../fo-o.html", + ZimPath("kiwix.org/fo-o.html"), + ), ["kiwix.org/fo-o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/fo%5Fo.html", # must not be encoded / must be decoded (RFC 3986 #2.3) - "../../fo_o.html", + RewriteResult( + "https://kiwix.org/fo%5Fo.html", + "../../fo_o.html", + ZimPath("kiwix.org/fo_o.html"), + ), ["kiwix.org/fo_o.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/foo%2Ehtml", # must not be encoded / must be decoded (RFC 3986 #2.3) - "../../foo.html", + RewriteResult( + "https://kiwix.org/foo%2Ehtml", + "../../foo.html", + ZimPath("kiwix.org/foo.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "#anchor1", - "#anchor1", + RewriteResult( + "https://kiwix.org/a/article/document.html#anchor1", + "#anchor1", + None, + ), ["kiwix.org/a/article/document.html"], False, ), ( "https://kiwix.org/a/article/", "#anchor1", - "#anchor1", + RewriteResult( + "https://kiwix.org/a/article/#anchor1", + "#anchor1", + None, + ), ["kiwix.org/a/article/"], False, ), ( "https://kiwix.org/a/article/", "../article/", - "./", + RewriteResult( + "https://kiwix.org/a/article/", + "./", + ZimPath("kiwix.org/a/article/"), + ), ["kiwix.org/a/article/"], False, ), @@ -388,7 +524,7 @@ def test_relative_url( article_url: str, know_paths: list[str], original_content_url: str, - expected_rewriten_content_url: str, + expected_rewrite_result: RewriteResult, *, rewrite_all_url: bool, ): @@ -401,31 +537,43 @@ def test_relative_url( rewriter( original_content_url, base_href=None, rewrite_all_url=rewrite_all_url ) - == expected_rewriten_content_url + == expected_rewrite_result ) @pytest.mark.parametrize( - "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "article_url, original_content_url, expected_rewrite_result, know_paths, " "rewrite_all_url", [ ( "https://kiwix.org/a/article/document.html", "/foo.html", - "../../foo.html", + RewriteResult( + "https://kiwix.org/foo.html", + "../../foo.html", + ZimPath("kiwix.org/foo.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/bar.html", - "https://kiwix.org/bar.html", # Full URL since not in known URLs + RewriteResult( + "https://kiwix.org/bar.html", + "https://kiwix.org/bar.html", # Full URL since not in known URLs + ZimPath("kiwix.org/bar.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "/bar.html", - "../../bar.html", # all URLs rewrite activated + RewriteResult( + "https://kiwix.org/bar.html", + "../../bar.html", # all URLs rewrite activated + ZimPath("kiwix.org/bar.html"), + ), ["kiwix.org/foo.html"], True, ), @@ -436,7 +584,7 @@ def test_absolute_path_url( article_url: str, know_paths: list[str], original_content_url: str, - expected_rewriten_content_url: str, + expected_rewrite_result: RewriteResult, *, rewrite_all_url: bool, ): @@ -449,66 +597,98 @@ def test_absolute_path_url( rewriter( original_content_url, base_href=None, rewrite_all_url=rewrite_all_url ) - == expected_rewriten_content_url + == expected_rewrite_result ) @pytest.mark.parametrize( - "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "article_url, original_content_url, expected_rewrite_result, know_paths, " "rewrite_all_url", [ ( "https://kiwix.org/a/article/document.html", "//kiwix.org/foo.html", - "../../foo.html", + RewriteResult( + "https://kiwix.org/foo.html", + "../../foo.html", + ZimPath("kiwix.org/foo.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "//kiwix.org/bar.html", - "https://kiwix.org/bar.html", # Full URL since not in known URLs + RewriteResult( + "https://kiwix.org/bar.html", + "https://kiwix.org/bar.html", # Full URL since not in known URLs + ZimPath("kiwix.org/bar.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "//kiwix.org/bar.html", - "../../bar.html", # all URLs rewrite activated + RewriteResult( + "https://kiwix.org/bar.html", + "../../bar.html", # all URLs rewrite activated + ZimPath("kiwix.org/bar.html"), + ), ["kiwix.org/foo.html"], True, ), ( "https://kiwix.org/a/article/document.html", "//acme.com/foo.html", - "../../../acme.com/foo.html", + RewriteResult( + "https://acme.com/foo.html", + "../../../acme.com/foo.html", + ZimPath("acme.com/foo.html"), + ), ["acme.com/foo.html"], False, ), ( "http://kiwix.org/a/article/document.html", "//acme.com/bar.html", - "http://acme.com/bar.html", # Full URL since not in known URLs + RewriteResult( + "http://acme.com/bar.html", + "http://acme.com/bar.html", # Full URL since not in known URLs + ZimPath("acme.com/bar.html"), + ), ["kiwix.org/foo.html"], False, ), ( "https://kiwix.org/a/article/document.html", "//acme.com/bar.html", - "../../../acme.com/bar.html", # all URLs rewrite activated + RewriteResult( + "https://acme.com/bar.html", + "../../../acme.com/bar.html", # all URLs rewrite activated + ZimPath("acme.com/bar.html"), + ), ["kiwix.org/foo.html"], True, ), ( # puny-encoded host is transformed into url-encoded value "https://kiwix.org/a/article/document.html", "//xn--exmple-cva.com/a/article/document.html", - "../../../ex%C3%A9mple.com/a/article/document.html", + RewriteResult( + "https://xn--exmple-cva.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ZimPath("exémple.com/a/article/document.html"), + ), ["exémple.com/a/article/document.html"], False, ), ( # host who should be puny-encoded ir transformed into url-encoded value "https://kiwix.org/a/article/document.html", "//exémple.com/a/article/document.html", - "../../../ex%C3%A9mple.com/a/article/document.html", + RewriteResult( + "https://exémple.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ZimPath("exémple.com/a/article/document.html"), + ), ["exémple.com/a/article/document.html"], False, ), @@ -519,7 +699,7 @@ def test_absolute_scheme_url( article_url: str, know_paths: list[str], original_content_url: str, - expected_rewriten_content_url: str, + expected_rewrite_result: RewriteResult, *, rewrite_all_url: bool, ): @@ -532,68 +712,109 @@ def test_absolute_scheme_url( rewriter( original_content_url, base_href=None, rewrite_all_url=rewrite_all_url ) - == expected_rewriten_content_url + == expected_rewrite_result ) @pytest.mark.parametrize( - "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "article_url, original_content_url, expected_rewriten_result, know_paths, " "rewrite_all_url", [ - ( + pytest.param( "https://kiwix.org/a/article/document.html", "https://foo.org/a/article/document.html", - "../../../foo.org/a/article/document.html", + RewriteResult( + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ZimPath("foo.org/a/article/document.html"), + ), ["foo.org/a/article/document.html"], False, + id="simple", ), - ( + pytest.param( "https://kiwix.org/a/article/document.html", "http://foo.org/a/article/document.html", - "../../../foo.org/a/article/document.html", + RewriteResult( + "http://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ZimPath("foo.org/a/article/document.html"), + ), ["foo.org/a/article/document.html"], False, + id="http_https", ), - ( + pytest.param( "http://kiwix.org/a/article/document.html", "https://foo.org/a/article/document.html", - "../../../foo.org/a/article/document.html", + RewriteResult( + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ZimPath("foo.org/a/article/document.html"), + ), ["foo.org/a/article/document.html"], False, + id="https_http", ), - ( + pytest.param( "http://kiwix.org/a/article/document.html", "https://user:password@foo.org:8080/a/article/document.html", - "../../../foo.org/a/article/document.html", + RewriteResult( + "https://user:password@foo.org:8080/a/article/document.html", + "../../../foo.org/a/article/document.html", + ZimPath("foo.org/a/article/document.html"), + ), ["foo.org/a/article/document.html"], False, + id="user_pass", ), - ( # Full URL since not in known URLs + pytest.param( # Full URL since not in known URLs "https://kiwix.org/a/article/document.html", "https://foo.org/a/article/document.html", - "https://foo.org/a/article/document.html", + RewriteResult( + "https://foo.org/a/article/document.html", + "https://foo.org/a/article/document.html", + ZimPath("foo.org/a/article/document.html"), + ), ["kiwix.org/a/article/foo/"], False, + id="not_known", ), - ( # all URLs rewrite activated + pytest.param( # all URLs rewrite activated "https://kiwix.org/a/article/document.html", "https://foo.org/a/article/document.html", - "../../../foo.org/a/article/document.html", + RewriteResult( + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ZimPath("foo.org/a/article/document.html"), + ), ["kiwix.org/a/article/foo/"], True, + id="not_known_rewrite_all", ), - ( # puny-encoded host is transformed into url-encoded value + pytest.param( # puny-encoded host is transformed into url-encoded value "https://kiwix.org/a/article/document.html", "https://xn--exmple-cva.com/a/article/document.html", - "../../../ex%C3%A9mple.com/a/article/document.html", + RewriteResult( + "https://xn--exmple-cva.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ZimPath("exémple.com/a/article/document.html"), + ), ["exémple.com/a/article/document.html"], False, + id="punny_encoded", ), - ( # host who should be puny-encoded is transformed into url-encoded value + pytest.param( # host who should be puny-encoded is transformed into + # url-encoded value "https://kiwix.org/a/article/document.html", "https://exémple.com/a/article/document.html", - "../../../ex%C3%A9mple.com/a/article/document.html", + RewriteResult( + "https://exémple.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ZimPath("exémple.com/a/article/document.html"), + ), ["exémple.com/a/article/document.html"], False, + id="should_be_punny_encoded", ), ], ) @@ -602,7 +823,7 @@ def test_absolute_url( article_url: str, know_paths: list[str], original_content_url: str, - expected_rewriten_content_url: str, + expected_rewriten_result: RewriteResult, *, rewrite_all_url: bool, ): @@ -615,7 +836,7 @@ def test_absolute_url( rewriter( original_content_url, base_href=None, rewrite_all_url=rewrite_all_url ) - == expected_rewriten_content_url + == expected_rewriten_result ) @pytest.mark.parametrize( @@ -637,42 +858,55 @@ def test_no_rewrite_other_schemes( ): article_url = HttpUrl("https://kiwix.org/a/article/document.html") rewriter = ArticleUrlRewriter(article_url=article_url) - assert ( - rewriter( - original_content_url, base_href=None, rewrite_all_url=rewrite_all_url - ) - == original_content_url - ) + assert rewriter( + original_content_url, base_href=None, rewrite_all_url=rewrite_all_url + ) == RewriteResult(original_content_url, original_content_url, None) @pytest.mark.parametrize( - "original_content_url, know_path, base_href, expected_rewriten_content_url", + "original_content_url, know_path, base_href, expected_rewriten_result", [ pytest.param( "foo.html", "kiwix.org/a/article/foo.html", None, - "foo.html", + RewriteResult( + "https://kiwix.org/a/article/foo.html", + "foo.html", + ZimPath("kiwix.org/a/article/foo.html"), + ), id="no_base", ), pytest.param( "foo.html", "kiwix.org/a/foo.html", "../", - "../foo.html", + RewriteResult( + "https://kiwix.org/a/foo.html", + "../foo.html", + ZimPath("kiwix.org/a/foo.html"), + ), id="parent_base", ), pytest.param( "foo.html", "kiwix.org/a/bar/foo.html", "../bar/", - "../bar/foo.html", + RewriteResult( + "https://kiwix.org/a/bar/foo.html", + "../bar/foo.html", + ZimPath("kiwix.org/a/bar/foo.html"), + ), id="base_in_another_folder", ), pytest.param( "foo.html", "www.example.com/foo.html", "https://www.example.com/", - "../../../www.example.com/foo.html", + RewriteResult( + "https://www.example.com/foo.html", + "../../../www.example.com/foo.html", + ZimPath("www.example.com/foo.html"), + ), id="base_on_absolute_url", ), ], @@ -682,7 +916,7 @@ def test_base_href( original_content_url: str, know_path: str, base_href: str, - expected_rewriten_content_url: str, + expected_rewriten_result: str, ): rewriter = ArticleUrlRewriter( article_url=HttpUrl("https://kiwix.org/a/article/document.html"), @@ -690,8 +924,66 @@ def test_base_href( ) assert ( rewriter(original_content_url, base_href=base_href, rewrite_all_url=False) - == expected_rewriten_content_url + == expected_rewriten_result + ) + + +class CustomRewriter(ArticleUrlRewriter): + + def __init__( + self, + *, + article_url: HttpUrl, + article_path: ZimPath | None = None, + existing_zim_paths: set[ZimPath] | None = None, + missing_zim_paths: set[ZimPath] | None = None, + ): + super().__init__( + article_url=article_url, + article_path=article_path, + existing_zim_paths=existing_zim_paths, + missing_zim_paths=missing_zim_paths, + ) + self.items_to_download: dict[ZimPath, set[HttpUrl]] = {} + + def __call__( + self, item_url: str, base_href: str | None, *, rewrite_all_url: bool = True + ) -> RewriteResult: + result = super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url) + if result.zim_path is None: + return result + if self.existing_zim_paths and result.zim_path not in self.existing_zim_paths: + return result + if result.zim_path in self.items_to_download: + self.items_to_download[result.zim_path].add(HttpUrl(result.absolute_url)) + else: + self.items_to_download[result.zim_path] = {HttpUrl(result.absolute_url)} + return result + + +class TestCustomRewriter: + + def test_items_to_download(self): + rewriter = CustomRewriter( + article_url=HttpUrl("https://kiwix.org/a/article/document.html"), + existing_zim_paths={ + ZimPath("kiwix.org/a/article/foo.html"), + ZimPath("kiwix.org/a/article/bar.html"), + }, ) + rewriter("foo.html#anchor1", base_href=None, rewrite_all_url=False) + rewriter("foo.html#anchor2", base_href=None, rewrite_all_url=False) + rewriter("bar.html", base_href=None, rewrite_all_url=False) + rewriter("missing.html", base_href=None, rewrite_all_url=False) + assert rewriter.items_to_download == { + ZimPath("kiwix.org/a/article/foo.html"): { + HttpUrl("https://kiwix.org/a/article/foo.html#anchor1"), + HttpUrl("https://kiwix.org/a/article/foo.html#anchor2"), + }, + ZimPath("kiwix.org/a/article/bar.html"): { + HttpUrl("https://kiwix.org/a/article/bar.html") + }, + } class TestHttpUrl: