This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

SitemapIndexReader implementation #640

Open · wants to merge 3 commits into main
9 changes: 9 additions & 0 deletions llama_hub/library.json
@@ -196,6 +196,15 @@
"seo"
]
},
"SitemapIndexReader": {
"id": "web/sitemap_index",
"author": "kapil-malik",
"keywords": [
"sitemap-index",
"website",
"seo"
]
},
"DatabaseReader": {
"id": "database",
"author": "kevinqz",
52 changes: 52 additions & 0 deletions llama_hub/web/sitemap_index/README.md
@@ -0,0 +1,52 @@
# Sitemap Index Loader

This loader is an asynchronous web scraper that fetches text from static websites that publish multiple sitemaps, discovering them through the site's sitemap index and optionally converting the HTML to text.

It is based on the [Sitemap Loader](https://llamahub.ai/l/web-sitemap).

## Usage

To use this loader, pass it the URL of your sitemap_index.xml like this:

```python
from llama_hub.web.sitemap_index import SitemapIndexReader

# for jupyter notebooks uncomment the following two lines of code:
# import nest_asyncio
# nest_asyncio.apply()

loader = SitemapIndexReader()
documents = loader.load_data(sitemap_index_url='https://docs.aws.amazon.com/sitemap_index.xml')
```
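`load_data` returns a list of llama_index `Document` objects, one per crawled page. A quick sketch of inspecting them, assuming the `Source` metadata key that the bundled tests use:

```python
# Hypothetical inspection: each Document carries the page text plus
# metadata in extra_info (the "Source" key is an assumption from the tests).
for doc in documents[:3]:
    print(doc.extra_info.get("Source"), len(doc.text))
```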

Be sure that the sitemap_index_url points to a proper [Sitemap Index](https://www.sitemaps.org/protocol.html#index).
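For reference, a minimal sitemap index has this shape (this example mirrors the test fixture bundled with this PR; the URLs are illustrative):

```xml
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/en/sitemap.xml</loc>
  </sitemap>
  <sitemap>
    <loc>https://example.com/fr/sitemap.xml</loc>
  </sitemap>
</sitemapindex>
```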

## Filter option

You can restrict which sitemaps from the index are actually crawled by passing the *sitemap_url_filters* argument to the `load_data` method. Each filter is matched as a plain substring against every sitemap `<loc>` URL in the index; by default, all sitemaps are crawled:

```python
documents = loader.load_data(
    sitemap_index_url='https://docs.aws.amazon.com/sitemap_index.xml',
    sitemap_url_filters=["AmazonCloudWatch", "amazondynamodb"],
)
# only crawl sitemaps whose URL contains one of these strings
```

## Issues with asyncio in Jupyter Notebooks

If you get a `RuntimeError: asyncio.run() cannot be called from a running event loop`, you might be interested in this [solution](https://saturncloud.io/blog/asynciorun-cannot-be-called-from-a-running-event-loop-a-guide-for-data-scientists-using-jupyter-notebook/#option-3-use-nest_asyncio).
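In practice the workaround is the `nest_asyncio` patch mentioned in the usage snippet above; a minimal sketch of a notebook cell, assuming the `nest_asyncio` package is installed:

```python
import nest_asyncio

# Patch the already-running Jupyter event loop so the reader's internal
# asyncio machinery can run inside it.
nest_asyncio.apply()

from llama_hub.web.sitemap_index import SitemapIndexReader

loader = SitemapIndexReader()
documents = loader.load_data(
    sitemap_index_url="https://docs.aws.amazon.com/sitemap_index.xml"
)
```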


### Old Usage

Use this syntax with earlier versions of llama_index, where llama_hub loaders were loaded via a separate download process:

```python
from llama_index import download_loader

SitemapIndexReader = download_loader("SitemapIndexReader")

loader = SitemapIndexReader()
documents = loader.load_data(sitemap_index_url='https://docs.aws.amazon.com/sitemap_index.xml')
```
6 changes: 6 additions & 0 deletions llama_hub/web/sitemap_index/__init__.py
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.web.sitemap_index.base import (
SitemapIndexReader,
)

__all__ = ["SitemapIndexReader"]
68 changes: 68 additions & 0 deletions llama_hub/web/sitemap_index/base.py
@@ -0,0 +1,68 @@
import logging
from typing import List, Optional

import requests
import xmltodict

from llama_index import download_loader
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

logger = logging.getLogger(__name__)

class SitemapIndexReader(BaseReader):
    """Sitemap Index reader. Reads data from a sitemap index document.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
        limit (int): Maximum number of concurrent requests.
    """

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        """Initialize with parameters."""
        try:
            from llama_hub.utils import import_loader

            SitemapReader = import_loader("SitemapReader")
        except ImportError:
            SitemapReader = download_loader("SitemapReader")

        self._sitemap_loader = SitemapReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def read_sitemap_urls(
        self, sitemap_index_url: str, sitemap_url_filters: List[str]
    ) -> List[str]:
        """Fetch the sitemap index and return the sitemap URLs it lists."""
        sitemap_response = requests.get(sitemap_index_url)
        sitemap_index_dict = xmltodict.parse(sitemap_response.text)

        sitemap_entries = sitemap_index_dict['sitemapindex']['sitemap']
        # xmltodict returns a single dict (not a list) when the index
        # contains exactly one <sitemap> entry.
        if isinstance(sitemap_entries, dict):
            sitemap_entries = [sitemap_entries]
        logger.info("Found %s sitemap entries", len(sitemap_entries))

        sitemap_urls = []
        for sitemap_entry in sitemap_entries:
            if 'loc' not in sitemap_entry:
                logger.info("Skipping sitemap entry without loc: %s", sitemap_entry)
                continue
            loc = sitemap_entry['loc']
            if not sitemap_url_filters or any(
                url_filter in loc for url_filter in sitemap_url_filters
            ):
                logger.info("Adding sitemap entry with loc: %s", loc)
                sitemap_urls.append(loc)

        return sitemap_urls

    def load_data(
        self, sitemap_index_url: str, sitemap_url_filters: Optional[List[str]] = None
    ) -> List[Document]:
        """Load data from a sitemap index document.

        Args:
            sitemap_index_url (str): URL of the sitemap index document.
            sitemap_url_filters (list): Substring filters used to select
                sitemap URLs from the index. Defaults to no filtering.
        """
        sitemap_urls = self.read_sitemap_urls(
            sitemap_index_url, sitemap_url_filters or []
        )
        sitemap_index_documents = []
        for url in sitemap_urls:
            sitemap_documents = self._sitemap_loader.load_data(sitemap_url=url)
            logger.info("Loaded %s documents from %s", len(sitemap_documents), url)
            sitemap_index_documents.extend(sitemap_documents)
        return sitemap_index_documents
2 changes: 2 additions & 0 deletions llama_hub/web/sitemap_index/requirements.txt
@@ -0,0 +1,2 @@
requests>=2.28.1
xmltodict>=0.12.0
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -21,6 +21,8 @@ atlassian-python-api = "*"
html2text = "*"
psutil = "*"
retrying = "*"
requests = ">=2.28.1"
xmltodict = ">=0.12.0"

[tool.poetry.dev-dependencies]
pytest = "7.2.1"
2 changes: 2 additions & 0 deletions test_requirements.txt
@@ -13,6 +13,8 @@ llama-index>=0.6.9
atlassian-python-api
html2text
olefile
requests>=2.28.1
xmltodict>=0.12.0

# hotfix
psutil
12 changes: 12 additions & 0 deletions tests/tests_web_sitemap_index/test_sitemap_index.xml
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://gpt-index.readthedocs.io/en/sitemap.xml</loc>
  </sitemap>
  <sitemap>
    <loc>https://gpt-index.readthedocs.io/fr/sitemap.xml</loc>
  </sitemap>
  <sitemap>
    <loc>https://gpt-index.readthedocs.io/jp/sitemap.xml</loc>
  </sitemap>
</sitemapindex>
93 changes: 93 additions & 0 deletions tests/tests_web_sitemap_index/test_web_sitemap_index_reader.py
@@ -0,0 +1,93 @@
import unittest
from unittest.mock import patch

import pytest
import requests
from llama_hub.web.sitemap_index.base import SitemapIndexReader
from llama_index.readers.schema.base import Document

MOCK_SITEMAP_INDEX_URL = "https://gpt-index.readthedocs.io/sitemap_index.xml"

SITEMAP_URL_REGION_MAP = {
    "https://gpt-index.readthedocs.io/en/sitemap.xml": "EN",
    "https://gpt-index.readthedocs.io/fr/sitemap.xml": "FR",
    "https://gpt-index.readthedocs.io/jp/sitemap.xml": "JP",
}

SITE_URLS = [
    "https://gpt-index.readthedocs.io/{region}/stable/",
    "https://gpt-index.readthedocs.io/{region}/latest/",
    "https://gpt-index.readthedocs.io/{region}/stable/quickstart/",
]

def get_sitemap_index_data():
    with open("tests/tests_web_sitemap_index/test_sitemap_index.xml", "r") as f:
        return f.read()


def dummy_load_pages(sitemap_url: str):
    region = SITEMAP_URL_REGION_MAP[sitemap_url]
    urls = [url.format(region=region) for url in SITE_URLS]
    documents = []
    for url in urls:
        doc = Document(
            text=f"Sample text in region: {region} for url: {url}",
            extra_info={"Source": url},
        )
        documents.append(doc)
    return documents


class TestSitemapIndexReader(unittest.TestCase):
    def test_sitemap_index_reader_init(self):
        # test without args
        SitemapIndexReader()

        # test with args
        SitemapIndexReader(html_to_text=True, limit=50)

    def test_sitemap_index_reader_load_data_invalid_args(self):
        sitemap_index_reader = SitemapIndexReader()

        with pytest.raises(
            TypeError,
            match="missing 1 required positional argument: 'sitemap_index_url'",
        ):
            sitemap_index_reader.load_data()

    @patch("llama_hub.web.sitemap.base.SitemapReader.load_data")
    def test_sitemap_index_reader_load_data(self, mock_load_data):
        with patch("requests.get") as mock_requests_get:
            sitemap_index_reader = SitemapIndexReader()

            # mock the sitemap index fetch
            mock_response = requests.Response()
            mock_response.headers['Content-Type'] = 'text/plain'
            mock_response._content = get_sitemap_index_data().encode('utf-8')
            mock_response.status_code = 200
            mock_requests_get.return_value = mock_response

            mock_load_data.side_effect = dummy_load_pages

            documents = sitemap_index_reader.load_data(
                sitemap_index_url=MOCK_SITEMAP_INDEX_URL
            )

            mock_requests_get.assert_called_once_with(MOCK_SITEMAP_INDEX_URL)
            assert mock_load_data.call_count == 3
            assert len(documents) == 9

    @patch("llama_hub.web.sitemap.base.SitemapReader.load_data")
    def test_sitemap_index_reader_load_data_with_filter(self, mock_load_data):
        with patch("requests.get") as mock_requests_get:
            sitemap_index_reader = SitemapIndexReader()

            # mock the sitemap index fetch
            mock_response = requests.Response()
            mock_response.headers['Content-Type'] = 'text/plain'
            mock_response._content = get_sitemap_index_data().encode('utf-8')
            mock_response.status_code = 200
            mock_requests_get.return_value = mock_response

            mock_load_data.side_effect = dummy_load_pages

            documents = sitemap_index_reader.load_data(
                sitemap_index_url=MOCK_SITEMAP_INDEX_URL,
                sitemap_url_filters=["en", "fr"],
            )

            mock_requests_get.assert_called_once_with(MOCK_SITEMAP_INDEX_URL)
            assert mock_load_data.call_count == 2
            assert len(documents) == 6