Skip to content

Commit

Permalink
iter
Browse files Browse the repository at this point in the history
  • Loading branch information
glemaitre committed Apr 11, 2024
1 parent ec1e289 commit b684a5b
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
15 changes: 11 additions & 4 deletions ragger_duck/scraping/_user_guide.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@

from ._shared import _chunk_document

SKLEARN_USER_GUIDE_URL = "https://scikit-learn.org/stable/modules/"
# URL lookup for scikit-learn documentation pages, keyed by HTML file name.
# A file name listed here is mapped directly to the associated URL; any other
# file name is appended to the "default" prefix to build its URL.
SKLEARN_USER_GUIDE_URL = {
"default": "https://scikit-learn.org/stable/",
"computing.html": "https://scikit-learn.org/stable/computing/",
"datasets.html": "https://scikit-learn.org/stable/datasets/",
}
# Module-level logger named after this module.
# NOTE(review): the variable is spelled `loogger` (double "o"); any rename to
# `logger` has to update every use in this module — confirm before changing.
loogger = logging.getLogger(__name__)


Expand All @@ -31,7 +35,9 @@ def _user_guide_path_to_user_guide_url(path):
str
The User Guide URL.
"""
return SKLEARN_USER_GUIDE_URL + path.name
if path.name in SKLEARN_USER_GUIDE_URL:
return SKLEARN_USER_GUIDE_URL[path.name]
return SKLEARN_USER_GUIDE_URL["default"] + path.name


def extract_user_guide_doc_from_single_file(html_file):
Expand Down Expand Up @@ -64,8 +70,9 @@ def extract_user_guide_doc_from_single_file(html_file):
with open(html_file, "r") as file:
soup = BeautifulSoup(file, "html.parser")

if soup.find("section") is not None:
text = soup.find("section").get_text("")
text = soup.find("section")
if text is not None:
text = text.get_text("")
else:
return {}
# Remove line breaks within a paragraph
Expand Down
14 changes: 13 additions & 1 deletion scripts/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,19 @@
USER_GUIDE_DOC_PATH = (
"/Users/glemaitre/Documents/packages/scikit-learn/doc/_build/html/stable"
)
USER_GUIDE_EXCLUDE_FOLDERS = ["modules/generated", "auto_examples/", "tutorial/"]
# Folders to exclude when processing the User Guide (per the constant's name).
# NOTE(review): presumably matched against paths under USER_GUIDE_DOC_PATH —
# confirm the matching rule in the consuming script, since some entries end
# with "/" and others ("modules/generated", "sg_execution_times") do not.
USER_GUIDE_EXCLUDE_FOLDERS = [
"_downloads/",
"_images/",
"_sources/",
"_static/",
"auto_examples/",
"binder/",
"modules/generated",
"notebooks/",
"sg_execution_times",
"testimonials/",
"tutorial/",
]

# Path to cache the embedding and models
CACHE_PATH = "../models"
Expand Down

0 comments on commit b684a5b

Please sign in to comment.