Return complete url rewrite information so that 'users' can decide what they want to do
openzim · Oct 24, 2024 · afac053 · afac053
1 parent d6a297b
commit afac053
Show file tree

Hide file tree

Showing 6 changed files with 434 additions and 113 deletions.
diff --git a/src/zimscraperlib/rewriting/css.py b/src/zimscraperlib/rewriting/css.py
@@ -51,7 +51,7 @@ def __simple_transform(
             [
                 "url(",
                 m_object["quote"],
-                url_rewriter(m_object["url"], base_href),
+                url_rewriter(m_object["url"], base_href).rewriten_url,
                 m_object["quote"],
                 ")",
             ]
@@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
                 new_url = self.url_rewriter(
                     url_node.value,  # pyright: ignore
                     self.base_href,
-                )
+                ).rewriten_url
                 url_node.value = str(new_url)  # pyright: ignore
                 url_node.representation = (  # pyright: ignore
                     f'"{serialize_url(str(new_url))}"'
@@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
         elif isinstance(node, ast.Declaration):
             self._process_list(node.value)  # pyright: ignore
         elif isinstance(node, ast.URLToken):
-            new_url = self.url_rewriter(node.value, self.base_href)  # pyright: ignore
+            new_url = self.url_rewriter(
+                node.value, self.base_href
+            ).rewriten_url  # pyright: ignore
             node.value = new_url
             node.representation = f"url({serialize_url(new_url)})"
 

diff --git a/src/zimscraperlib/rewriting/html.py b/src/zimscraperlib/rewriting/html.py
@@ -603,7 +603,9 @@ def rewrite_href_src_attributes(
         notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
     return (
         attr_name,
-        url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
+        url_rewriter(
+            attr_value, base_href=base_href, rewrite_all_url=tag != "a"
+        ).rewriten_url,
     )
 
 
@@ -618,10 +620,10 @@ def rewrite_srcset_attribute(
     if attr_name != "srcset" or not attr_value:
         return
     value_list = attr_value.split(",")
-    new_value_list = []
+    new_value_list: list[str] = []
     for value in value_list:
         url, *other = value.strip().split(" ", maxsplit=1)
-        new_url = url_rewriter(url, base_href=base_href)
+        new_url = url_rewriter(url, base_href=base_href).rewriten_url
         new_value = " ".join([new_url, *other])
         new_value_list.append(new_value)
     return (attr_name, ", ".join(new_value_list))
@@ -711,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
         return
     return (
         attr_name,
-        f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
+        f"{match['interval']};"
+        f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
     )
diff --git a/src/zimscraperlib/rewriting/js.py b/src/zimscraperlib/rewriting/js.py
@@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
             This takes into account that the result must be a relative URL, i.e. it
             cannot be 'vendor.module.js' but must be './vendor.module.js'.
             """
-            url = self.url_rewriter(url, base_href=self.base_href)
+            url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
             if not (
                 url.startswith("/") or url.startswith("./") or url.startswith("../")
             ):

diff --git a/src/zimscraperlib/rewriting/url_rewriting.py b/src/zimscraperlib/rewriting/url_rewriting.py
@@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
             raise ValueError(f"Unexpected password in value: {value} {parts.password}")
 
 
+class RewriteResult(NamedTuple):
+    absolute_url: str  # item URL resolved against article URL and base_href
+    rewriten_url: str  # value callers write back into the rewritten document
+    zim_path: ZimPath | None  # None when no ZIM path was computed for the item
+
+
 class ArticleUrlRewriter:
     """
     Rewrite urls in article.
@@ -176,16 +182,11 @@ def __init__(
           missing_zim_paths: list of ZIM paths which are known to already be missing
         from the existing_zim_paths ; usefull only in complement with this variable ;
         new missing entries will be added as URLs are normalized in this function
-
-        Results:
-          items_to_download: populated with the list of rewritten URLs, so that one
-        might use it to download items after rewriting the document
         """
         self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
         self.article_url = article_url
         self.existing_zim_paths = existing_zim_paths
         self.missing_zim_paths = missing_zim_paths
-        self.items_to_download: dict[ZimPath, HttpUrl] = {}
 
     def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
         """Utility to transform an item URL into a ZimPath"""
@@ -201,7 +202,7 @@ def __call__(
         base_href: str | None,
         *,
         rewrite_all_url: bool = True,
-    ) -> str:
+    ) -> RewriteResult:
         """Rewrite a url contained in a article.
 
         The url is "fully" rewrited to point to a normalized entry path
@@ -210,17 +211,25 @@ def __call__(
         try:
             item_url = item_url.strip()
 
+            item_absolute_url = urljoin(
+                urljoin(self.article_url.value, base_href), item_url
+            )
+
             # Make case of standalone fragments more straightforward
             if item_url.startswith("#"):
-                return item_url
+                return RewriteResult(
+                    absolute_url=item_absolute_url,
+                    rewriten_url=item_url,
+                    zim_path=None,
+                )
 
             item_scheme = urlsplit(item_url).scheme
             if item_scheme and item_scheme not in ("http", "https"):
-                return item_url
-
-            item_absolute_url = urljoin(
-                urljoin(self.article_url.value, base_href), item_url
-            )
+                return RewriteResult(
+                    absolute_url=item_absolute_url,
+                    rewriten_url=item_url,
+                    zim_path=None,
+                )
 
             item_fragment = urlsplit(item_absolute_url).fragment
 
@@ -229,9 +238,11 @@ def __call__(
             if rewrite_all_url or (
                 self.existing_zim_paths and item_path in self.existing_zim_paths
             ):
-                if item_path not in self.items_to_download:
-                    self.items_to_download[item_path] = HttpUrl(item_absolute_url)
-                return self.get_document_uri(item_path, item_fragment)
+                return RewriteResult(
+                    absolute_url=item_absolute_url,
+                    rewriten_url=self.get_document_uri(item_path, item_fragment),
+                    zim_path=item_path,
+                )
             else:
                 if (
                     self.missing_zim_paths is not None
@@ -242,7 +253,11 @@ def __call__(
                     # with duplicate messages
                     self.missing_zim_paths.add(item_path)
                 # The url doesn't point to a known entry
-                return item_absolute_url
+                return RewriteResult(
+                    absolute_url=item_absolute_url,
+                    rewriten_url=item_absolute_url,
+                    zim_path=item_path,
+                )
 
         except Exception as exc:  # pragma: no cover
             item_scheme = (
@@ -275,7 +290,11 @@ def __call__(
                 f"rewrite_all_url: {rewrite_all_url}",
                 exc_info=exc,
             )
-            return item_url
+            return RewriteResult(
+                absolute_url=item_absolute_url,
+                rewriten_url=item_url,
+                zim_path=None,
+            )
 
     def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
         """Given an ZIM item path and its fragment, get the URI to use in document

diff --git a/tests/rewriting/conftest.py b/tests/rewriting/conftest.py
@@ -7,6 +7,7 @@
 from zimscraperlib.rewriting.url_rewriting import (
     ArticleUrlRewriter,
     HttpUrl,
+    RewriteResult,
     ZimPath,
 )
 
@@ -24,8 +25,12 @@ def __call__(
         base_href: str | None,  # noqa: ARG002
         *,
         rewrite_all_url: bool = True,  # noqa: ARG002
-    ) -> str:
-        return item_url + self.suffix
+    ) -> RewriteResult:
+        return RewriteResult(
+            absolute_url=item_url + self.suffix,
+            rewriten_url=item_url + self.suffix,
+            zim_path=None,
+        )
 
     def get_item_path(
         self, item_url: str, base_href: str | None  # noqa: ARG002