From 22abfbc61ed507db91d77ab0e40121810737b921 Mon Sep 17 00:00:00 2001
From: Denis Shulyaka
Date: Thu, 12 Dec 2024 20:28:05 +0300
Subject: [PATCH] Include page title

---
 .../powerllm/tools/web_scrape.py | 27 ++++++++++++++-----
 tests/tools/test_web_scrape.py   |  8 ++++--
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/custom_components/powerllm/tools/web_scrape.py b/custom_components/powerllm/tools/web_scrape.py
index 3eef828..d3e3e59 100644
--- a/custom_components/powerllm/tools/web_scrape.py
+++ b/custom_components/powerllm/tools/web_scrape.py
@@ -11,25 +11,40 @@
 _LOGGER = logging.getLogger(__name__)
 
 
+REMOVE_KEYS = {
+    "hostname",
+    "fingerprint",
+    "id",
+    "raw_text",
+    "language",
+    "image",
+    "pagetype",
+    "filedate",
+    "source",
+    "source-hostname",
+    "tags",
+}
+
+
 def setup(hass: HomeAssistant):
     """Register the tool on integration startup."""
 
     @llm_tool(hass)
     def web_scrape(url: str):
         """Get latest content of a web page."""
-        downloaded = trafilatura.fetch_url("linux.org.ru")
+        downloaded = trafilatura.fetch_url(url=url)
 
         parsed = trafilatura.extract(
             downloaded,
+            url=url,
             output_format="json",
             include_links=True,
             deduplicate=True,
-            favor_precision=True,
+            favor_precision=False,
+            favor_recall=True,
+            with_metadata=True,
         )
 
         result = json.loads(parsed)
 
-        if "comments" in result and not result["comments"]:
-            del result["comments"]
-
-        return result
+        return {k: v for k, v in result.items() if v and k not in REMOVE_KEYS}
diff --git a/tests/tools/test_web_scrape.py b/tests/tools/test_web_scrape.py
index ee933e8..88ee1e7 100644
--- a/tests/tools/test_web_scrape.py
+++ b/tests/tools/test_web_scrape.py
@@ -24,7 +24,11 @@ async def test_web_scrape_tool(async_call_tool) -> None:
 
     """
 
-    with patch("trafilatura.fetch_url", return_value=helloworld):
+    with patch("trafilatura.fetch_url", return_value=helloworld) as mock_fetch:
         response = await async_call_tool("web_scrape", url="example.com")
 
-    assert response == {"text": "This is a simple HTML page with a greeting message."}
+    mock_fetch.assert_called_once_with(url="example.com")
+    assert response == {
+        "title": "Hello, World!",
+        "text": "Hello, World!\nThis is a simple HTML page with a greeting message.",
+    }