Skip to content

Commit

Permalink
Include page title
Browse files Browse the repository at this point in the history
  • Loading branch information
Shulyaka committed Dec 12, 2024
1 parent 29add0b commit 22abfbc
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 8 deletions.
27 changes: 21 additions & 6 deletions custom_components/powerllm/tools/web_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,40 @@
_LOGGER = logging.getLogger(__name__)


# Keys that web_scrape drops from the parsed extraction result before
# returning it to the caller.
REMOVE_KEYS = {
    "filedate",
    "fingerprint",
    "hostname",
    "id",
    "image",
    "language",
    "pagetype",
    "raw_text",
    "source",
    "source-hostname",
    "tags",
}


def setup(hass: HomeAssistant):
    """Register the tool on integration startup.

    Defines the ``web_scrape`` tool and registers it through the
    ``llm_tool`` decorator bound to ``hass``.
    """

    @llm_tool(hass)
    def web_scrape(url: str):
        """Get latest content of a web page."""
        # Fetch exactly the page that was asked for; the URL is passed by
        # keyword, matching how the test suite asserts the call.
        downloaded = trafilatura.fetch_url(url=url)
        if downloaded is None:
            # fetch_url signals a failed download by returning None.
            raise ValueError(f"Failed to download {url}")

        # Recall is favoured over precision so that as much of the page
        # text as possible is kept; metadata (e.g. the title) is included.
        parsed = trafilatura.extract(
            downloaded,
            url=url,
            output_format="json",
            include_links=True,
            deduplicate=True,
            favor_precision=False,
            favor_recall=True,
            with_metadata=True,
        )
        if parsed is None:
            # extract returns None when nothing usable could be extracted;
            # fail with a clear message instead of a TypeError in json.loads.
            raise ValueError(f"No content could be extracted from {url}")

        result = json.loads(parsed)

        # Drop empty values (this also covers an empty "comments" entry)
        # and the keys listed in REMOVE_KEYS.
        return {k: v for k, v in result.items() if v and k not in REMOVE_KEYS}
8 changes: 6 additions & 2 deletions tests/tools/test_web_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ async def test_web_scrape_tool(async_call_tool) -> None:
</html>
"""

with patch("trafilatura.fetch_url", return_value=helloworld):
with patch("trafilatura.fetch_url", return_value=helloworld) as mock_fetch:
response = await async_call_tool("web_scrape", url="example.com")

assert response == {"text": "This is a simple HTML page with a greeting message."}
mock_fetch.assert_called_once_with(url="example.com")
assert response == {
"title": "Hello, World!",
"text": "Hello, World!\nThis is a simple HTML page with a greeting message.",
}

0 comments on commit 22abfbc

Please sign in to comment.