Skip to content

Commit

Permalink
Return complete url rewrite information so that 'users' can decide wh…
Browse files Browse the repository at this point in the history
…at they want to do
  • Loading branch information
benoit74 committed Oct 24, 2024
1 parent d6a297b commit afac053
Show file tree
Hide file tree
Showing 6 changed files with 434 additions and 113 deletions.
8 changes: 5 additions & 3 deletions src/zimscraperlib/rewriting/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __simple_transform(
[
"url(",
m_object["quote"],
url_rewriter(m_object["url"], base_href),
url_rewriter(m_object["url"], base_href).rewriten_url,
m_object["quote"],
")",
]
Expand Down Expand Up @@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
new_url = self.url_rewriter(
url_node.value, # pyright: ignore
self.base_href,
)
).rewriten_url
url_node.value = str(new_url) # pyright: ignore
url_node.representation = ( # pyright: ignore
f'"{serialize_url(str(new_url))}"'
Expand All @@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
elif isinstance(node, ast.Declaration):
self._process_list(node.value) # pyright: ignore
elif isinstance(node, ast.URLToken):
new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore
new_url = self.url_rewriter(
node.value, self.base_href
).rewriten_url # pyright: ignore
node.value = new_url
node.representation = f"url({serialize_url(new_url)})"

Expand Down
11 changes: 7 additions & 4 deletions src/zimscraperlib/rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,9 @@ def rewrite_href_src_attributes(
notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
return (
attr_name,
url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
url_rewriter(
attr_value, base_href=base_href, rewrite_all_url=tag != "a"
).rewriten_url,
)


Expand All @@ -618,10 +620,10 @@ def rewrite_srcset_attribute(
if attr_name != "srcset" or not attr_value:
return
value_list = attr_value.split(",")
new_value_list = []
new_value_list: list[str] = []
for value in value_list:
url, *other = value.strip().split(" ", maxsplit=1)
new_url = url_rewriter(url, base_href=base_href)
new_url = url_rewriter(url, base_href=base_href).rewriten_url
new_value = " ".join([new_url, *other])
new_value_list.append(new_value)
return (attr_name, ", ".join(new_value_list))
Expand Down Expand Up @@ -711,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
return
return (
attr_name,
f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
f"{match['interval']};"
f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
)
2 changes: 1 addition & 1 deletion src/zimscraperlib/rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
This takes into account that the result must be a relative URL, i.e. it
cannot be 'vendor.module.js' but must be './vendor.module.js'.
"""
url = self.url_rewriter(url, base_href=self.base_href)
url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
if not (
url.startswith("/") or url.startswith("./") or url.startswith("../")
):
Expand Down
53 changes: 36 additions & 17 deletions src/zimscraperlib/rewriting/url_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
raise ValueError(f"Unexpected password in value: {value} {parts.password}")


class RewriteResult(NamedTuple):
# Complete outcome of rewriting a single URL, as returned by
# ArticleUrlRewriter.__call__; callers pick whichever field they need.
#
# Item URL resolved with urljoin against the article URL and the base href.
absolute_url: str
# Value to place back in the document: the original item URL when it is left
# untouched (standalone fragment, non-http(s) scheme, or rewrite failure), the
# document URI of the ZIM entry when rewritten, or the absolute URL when the
# target is not a known entry and not everything is rewritten.
rewriten_url: str
# ZIM path computed for the target; None in the cases where the URL is
# returned untouched and no path was computed.
zim_path: ZimPath | None


class ArticleUrlRewriter:
"""
Rewrite urls in article.
Expand Down Expand Up @@ -176,16 +182,11 @@ def __init__(
missing_zim_paths: list of ZIM paths which are known to already be missing
from the existing_zim_paths ; useful only in complement with this variable ;
new missing entries will be added as URLs are normalized in this function
Results:
items_to_download: populated with the list of rewritten URLs, so that one
might use it to download items after rewriting the document
"""
self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
self.article_url = article_url
self.existing_zim_paths = existing_zim_paths
self.missing_zim_paths = missing_zim_paths
self.items_to_download: dict[ZimPath, HttpUrl] = {}

def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
"""Utility to transform an item URL into a ZimPath"""
Expand All @@ -201,7 +202,7 @@ def __call__(
base_href: str | None,
*,
rewrite_all_url: bool = True,
) -> str:
) -> RewriteResult:
"""Rewrite a url contained in a article.
The url is "fully" rewritten to point to a normalized entry path
Expand All @@ -210,17 +211,25 @@ def __call__(
try:
item_url = item_url.strip()

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)

# Make case of standalone fragments more straightforward
if item_url.startswith("#"):
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_scheme = urlsplit(item_url).scheme
if item_scheme and item_scheme not in ("http", "https"):
return item_url

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_fragment = urlsplit(item_absolute_url).fragment

Expand All @@ -229,9 +238,11 @@ def __call__(
if rewrite_all_url or (
self.existing_zim_paths and item_path in self.existing_zim_paths
):
if item_path not in self.items_to_download:
self.items_to_download[item_path] = HttpUrl(item_absolute_url)
return self.get_document_uri(item_path, item_fragment)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=self.get_document_uri(item_path, item_fragment),
zim_path=item_path,
)
else:
if (
self.missing_zim_paths is not None
Expand All @@ -242,7 +253,11 @@ def __call__(
# with duplicate messages
self.missing_zim_paths.add(item_path)
# The url doesn't point to a known entry
return item_absolute_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_absolute_url,
zim_path=item_path,
)

except Exception as exc: # pragma: no cover
item_scheme = (
Expand Down Expand Up @@ -275,7 +290,11 @@ def __call__(
f"rewrite_all_url: {rewrite_all_url}",
exc_info=exc,
)
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
"""Given an ZIM item path and its fragment, get the URI to use in document
Expand Down
9 changes: 7 additions & 2 deletions tests/rewriting/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
RewriteResult,
ZimPath,
)

Expand All @@ -24,8 +25,12 @@ def __call__(
base_href: str | None, # noqa: ARG002
*,
rewrite_all_url: bool = True, # noqa: ARG002
) -> str:
return item_url + self.suffix
) -> RewriteResult:
return RewriteResult(
absolute_url=item_url + self.suffix,
rewriten_url=item_url + self.suffix,
zim_path=None,
)

def get_item_path(
self, item_url: str, base_href: str | None # noqa: ARG002
Expand Down
Loading

0 comments on commit afac053

Please sign in to comment.