Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More changes for mindtouch scraper #208

Merged
merged 3 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/zimscraperlib/rewriting/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __simple_transform(
[
"url(",
m_object["quote"],
url_rewriter(m_object["url"], base_href),
url_rewriter(m_object["url"], base_href).rewriten_url,
m_object["quote"],
")",
]
Expand Down Expand Up @@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
new_url = self.url_rewriter(
url_node.value, # pyright: ignore
self.base_href,
)
).rewriten_url
url_node.value = str(new_url) # pyright: ignore
url_node.representation = ( # pyright: ignore
f'"{serialize_url(str(new_url))}"'
Expand All @@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
elif isinstance(node, ast.Declaration):
self._process_list(node.value) # pyright: ignore
elif isinstance(node, ast.URLToken):
new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore
new_url = self.url_rewriter(
node.value, self.base_href
).rewriten_url # pyright: ignore
node.value = new_url
node.representation = f"url({serialize_url(new_url)})"

Expand Down
24 changes: 15 additions & 9 deletions src/zimscraperlib/rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,9 @@ class HtmlRewriter(HTMLParser):
def __init__(
self,
url_rewriter: ArticleUrlRewriter,
pre_head_insert: str,
pre_head_insert: str | None,
post_head_insert: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
):
super().__init__(convert_charrefs=False)
self.url_rewriter = url_rewriter
Expand Down Expand Up @@ -430,7 +430,7 @@ def do_attribute_rewrite(
css_rewriter: CssRewriter,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
) -> AttrNameAndValue:
"""Utility function to process all attribute rewriting rules

Expand Down Expand Up @@ -587,7 +587,7 @@ def rewrite_href_src_attributes(
attrs: AttrsList,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
"""Rewrite href and src attributes

Expand All @@ -596,11 +596,16 @@ def rewrite_href_src_attributes(
"""
if attr_name not in ("href", "src") or not attr_value:
return
if get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module":
if (
notify_js_module
and get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
):
notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
return (
attr_name,
url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
url_rewriter(
attr_value, base_href=base_href, rewrite_all_url=tag != "a"
).rewriten_url,
)


Expand All @@ -615,10 +620,10 @@ def rewrite_srcset_attribute(
if attr_name != "srcset" or not attr_value:
return
value_list = attr_value.split(",")
new_value_list = []
new_value_list: list[str] = []
for value in value_list:
url, *other = value.strip().split(" ", maxsplit=1)
new_url = url_rewriter(url, base_href=base_href)
new_url = url_rewriter(url, base_href=base_href).rewriten_url
new_value = " ".join([new_url, *other])
new_value_list.append(new_value)
return (attr_name, ", ".join(new_value_list))
Expand Down Expand Up @@ -708,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
return
return (
attr_name,
f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
f"{match['interval']};"
f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
)
13 changes: 7 additions & 6 deletions src/zimscraperlib/rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def __init__(
self,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
super().__init__(None)
self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES)
Expand Down Expand Up @@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
This takes into account that the result must be a relative URL, i.e. it
cannot be 'vendor.module.js' but must be './vendor.module.js'.
"""
url = self.url_rewriter(url, base_href=self.base_href)
url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
if not (
url.startswith("/") or url.startswith("./") or url.startswith("../")
):
Expand All @@ -298,11 +298,12 @@ def func(
m_object: re.Match[str], _opts: dict[str, Any] | None = None
) -> str:
def sub_funct(match: re.Match[str]) -> str:
self.notify_js_module(
self.url_rewriter.get_item_path(
match.group(2), base_href=self.base_href
if self.notify_js_module:
self.notify_js_module(
self.url_rewriter.get_item_path(
match.group(2), base_href=self.base_href
)
)
)
return (
f"{match.group(1)}{get_rewriten_import_url(match.group(2))}"
f"{match.group(3)}"
Expand Down
57 changes: 38 additions & 19 deletions src/zimscraperlib/rewriting/url_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __str__(self) -> str:
return f"HttpUrl({self.value})"

def __repr__(self) -> str:
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
return f"HttpUrl({self.value})" # pragma: no cover

@property
def value(self) -> str:
Expand Down Expand Up @@ -124,7 +124,7 @@ def __str__(self) -> str:
return f"ZimPath({self.value})"

def __repr__(self) -> str:
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
return f"ZimPath({self.value})" # pragma: no cover

@property
def value(self) -> str:
Expand All @@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
raise ValueError(f"Unexpected password in value: {value} {parts.password}")


class RewriteResult(NamedTuple):
    """Outcome of rewriting a single URL found in an article.

    This is the return type of ArticleUrlRewriter.__call__. Callers that only
    need the replacement text to put back in the document read `rewriten_url`.
    """

    # Item URL resolved (urljoin) against the article URL and the base href.
    absolute_url: str
    # Value to write back into the document: the original item URL when it is
    # left untouched (standalone fragment, non-http(s) scheme, or rewrite
    # failure), the document URI of the ZIM item when the URL is rewritten, or
    # the absolute URL when the URL does not point to a known entry.
    rewriten_url: str
    # ZIM path computed for the item; None when the original URL is returned
    # unchanged (standalone fragment, non-http(s) scheme, or rewrite failure).
    zim_path: ZimPath | None


class ArticleUrlRewriter:
"""
Rewrite urls in article.
Expand Down Expand Up @@ -176,16 +182,11 @@ def __init__(
missing_zim_paths: list of ZIM paths which are known to already be missing
from the existing_zim_paths ; useful only in complement with this variable ;
new missing entries will be added as URLs are normalized in this function

Results:
items_to_download: populated with the list of rewritten URLs, so that one
might use it to download items after rewriting the document
"""
self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
self.article_url = article_url
self.existing_zim_paths = existing_zim_paths
self.missing_zim_paths = missing_zim_paths
self.items_to_download: dict[ZimPath, HttpUrl] = {}

def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
"""Utility to transform an item URL into a ZimPath"""
Expand All @@ -201,7 +202,7 @@ def __call__(
base_href: str | None,
*,
rewrite_all_url: bool = True,
) -> str:
) -> RewriteResult:
"""Rewrite a url contained in a article.

The url is "fully" rewritten to point to a normalized entry path
Expand All @@ -210,17 +211,25 @@ def __call__(
try:
item_url = item_url.strip()

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)

# Make case of standalone fragments more straightforward
if item_url.startswith("#"):
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_scheme = urlsplit(item_url).scheme
if item_scheme and item_scheme not in ("http", "https"):
return item_url

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_fragment = urlsplit(item_absolute_url).fragment

Expand All @@ -229,9 +238,11 @@ def __call__(
if rewrite_all_url or (
self.existing_zim_paths and item_path in self.existing_zim_paths
):
if item_path not in self.items_to_download:
self.items_to_download[item_path] = HttpUrl(item_absolute_url)
return self.get_document_uri(item_path, item_fragment)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=self.get_document_uri(item_path, item_fragment),
zim_path=item_path,
)
else:
if (
self.missing_zim_paths is not None
Expand All @@ -242,7 +253,11 @@ def __call__(
# with duplicate messages
self.missing_zim_paths.add(item_path)
# The url doesn't point to a known entry
return item_absolute_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_absolute_url,
zim_path=item_path,
)

except Exception as exc: # pragma: no cover
item_scheme = (
Expand Down Expand Up @@ -275,7 +290,11 @@ def __call__(
f"rewrite_all_url: {rewrite_all_url}",
exc_info=exc,
)
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
"""Given an ZIM item path and its fragment, get the URI to use in document
Expand Down
19 changes: 7 additions & 12 deletions tests/rewriting/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,11 @@
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
RewriteResult,
ZimPath,
)


@pytest.fixture(scope="module")
def no_js_notify():
"""Fixture to not care about notification of detection of a JS file"""

def no_js_notify_handler(_: str):
pass

yield no_js_notify_handler


class SimpleUrlRewriter(ArticleUrlRewriter):
"""Basic URL rewriter mocking most calls"""

Expand All @@ -34,8 +25,12 @@ def __call__(
base_href: str | None, # noqa: ARG002
*,
rewrite_all_url: bool = True, # noqa: ARG002
) -> str:
return item_url + self.suffix
) -> RewriteResult:
return RewriteResult(
absolute_url=item_url + self.suffix,
rewriten_url=item_url + self.suffix,
zim_path=None,
)

def get_item_path(
self, item_url: str, base_href: str | None # noqa: ARG002
Expand Down
Loading