iter
glemaitre committed Dec 28, 2023
1 parent ffba4d9 commit d888f7d
Showing 11 changed files with 56 additions and 391 deletions.
4 changes: 2 additions & 2 deletions app/configuration/default.py
@@ -1,7 +1,7 @@
# Retriever parameters
-SEMANTIC_RETRIEVER_PATH = "../models/api_semantic_retrieval.joblib"
+SEMANTIC_RETRIEVER_PATH = "../models/user_guide_semantic_retrieval.joblib"
SEMANTIC_TOP_K = 5
-LEXICAL_RETRIEVER_PATH = "../models/api_lexical_retrieval.joblib"
+LEXICAL_RETRIEVER_PATH = "../models/user_guide_lexical_retrieval.joblib"
LEXICAL_TOP_K = 5
CROSS_ENCODER_PATH = "cross-encoder/ms-marco-MiniLM-L-6-v2"
CROSS_ENCODER_THRESHOLD = 2.0
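
For context, a minimal sketch of how the configuration above could be wired
together at start-up. The retriever objects' `query(text, top_k)` method, the
candidate dictionary layout, and the import path are assumptions for
illustration, not the actual ragger_duck interfaces.

# Hypothetical wiring of the configuration shown above.
import joblib
from sentence_transformers import CrossEncoder

from app.configuration import default as config  # import path assumed

semantic_retriever = joblib.load(config.SEMANTIC_RETRIEVER_PATH)
lexical_retriever = joblib.load(config.LEXICAL_RETRIEVER_PATH)
reranker = CrossEncoder(config.CROSS_ENCODER_PATH)


def retrieve(query):
    # Gather candidates from both retrievers (query API assumed).
    candidates = semantic_retriever.query(query, top_k=config.SEMANTIC_TOP_K)
    candidates += lexical_retriever.query(query, top_k=config.LEXICAL_TOP_K)
    # Rerank (query, document) pairs with the cross-encoder and keep the
    # candidates scoring above the configured threshold.
    scores = reranker.predict([(query, doc["text"]) for doc in candidates])
    return [
        doc
        for doc, score in zip(candidates, scores)
        if score >= config.CROSS_ENCODER_THRESHOLD
    ]
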
2 changes: 1 addition & 1 deletion doc/references/scraping.rst
@@ -13,5 +13,5 @@ Scraping the documentation
:toctree: generated/
:template: class.rst

-APIDocExtractor
APINumPyDocExtractor
+UserGuideDocExtractor
18 changes: 10 additions & 8 deletions doc/user_guide.rst
@@ -12,10 +12,8 @@ Scraping
The scraping module provides simple estimators that extract meaningful
documentation from the documentation website.

-:class:`~ragger_duck.scraping.APIDocExtractor` is a scraper that loads the
-HTML pages and extract the documentation from the main API section. One can
-provide a `chunk_size` and `chunk_overlap` to further split the documentation
-sections into smaller chunks.
+API documentation
+-----------------

:class:`~ragger_duck.scraping.APINumPyDocExtractor` is a more advanced scraper
that uses `numpydoc` and its scraper to extract the documentation. Indeed, the
@@ -24,10 +22,14 @@ chunks of documentation from the parsed sections. While we don't control for
the chunk size, the chunks are built such that they contain information only
about a specific parameter and always refer to the class or function. We hope
that scraping in such a way can remove ambiguity that could exist when building chunks
-without any control. Since we rely on `numpydoc` for the parsing that expect
-a specific formatting, then
-:class:`~ragger_duck.scraping.APINumPyDocExtractor` is much faster than
-:class:`~ragger_duck.scraping.APIDocExtractor`.
+without any control.

+User Guide documentation
+------------------------

+:class:`~ragger_duck.scraping.UserGuideDocExtractor` is a scraper that extracts
+documentation from the user guide. It is a simple scraper that extracts
+text information from the webpage. Additionally, this text can be chunked.

Retriever
=========
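
To make the scraping documentation above concrete, a minimal usage sketch. It
assumes both extractors follow the scikit-learn transformer API (stateless
`fit`/`transform` over a folder of built HTML pages, returning dictionaries
with `source` and `text` keys); the folder paths and the
`UserGuideDocExtractor` constructor arguments are placeholders.

from pathlib import Path

from ragger_duck.scraping import APINumPyDocExtractor, UserGuideDocExtractor

# API documentation: chunks are driven by numpydoc parsing, one parameter at
# a time, so there is no chunk-size knob to tune.
api_chunks = APINumPyDocExtractor().fit_transform(
    Path("scikit-learn/doc/_build/html/modules/generated")  # placeholder
)

# User guide: plain text extraction from the web pages, optionally chunked.
user_guide_chunks = UserGuideDocExtractor(
    chunk_size=300, chunk_overlap=50  # assumed parameters
).fit_transform(Path("scikit-learn/doc/_build/html"))  # placeholder

print(api_chunks[0]["source"], api_chunks[0]["text"][:80])
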
7 changes: 3 additions & 4 deletions ragger_duck/prompt/_api.py
@@ -50,16 +50,15 @@ def __call__(self, query, **prompt_kwargs):
" queries"
)
prompt = (
"[INST] Rephrase the query to have correct wording in a context of "
"machine-learning. Make sure to have the right spelling. Finally, only "
"provide a list of keywords separated by commas.\n\n"
"[INST] Extract a list of keywords from the query below for a context"
" of machine-learning using scikit-learn.\n\n"
f"query: {query}[/INST]"
)

# do not create a stream generator
local_prompt_kwargs = prompt_kwargs.copy()
local_prompt_kwargs["stream"] = False
logger.info("Prompting to get keywords from the query")
logger.info(f"Prompting to get keywords from the query:\n{prompt}")
response = self.llm(prompt, **local_prompt_kwargs)
keywords = response["choices"][0]["text"].strip()
logger.info(f"Keywords: {keywords}")
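
The prompt change above swaps a query-rephrasing instruction for a
keyword-extraction instruction. A minimal sketch of the call pattern, assuming
a llama-cpp-python style callable (suggested by the
`response["choices"][0]["text"]` access); the model path is a placeholder and
the printed keywords are only an example.

from llama_cpp import Llama

llm = Llama(model_path="path/to/mistral-7b-instruct.gguf")  # placeholder

query = "how do i pick n_estimators for random forests?"
prompt = (
    "[INST] Extract a list of keywords from the query below for a context"
    " of machine-learning using scikit-learn.\n\n"
    f"query: {query}[/INST]"
)
# Ask for a single, non-streamed completion, as in the code above.
response = llm(prompt, stream=False)
keywords = response["choices"][0]["text"].strip()
print(keywords)  # e.g. "random forest, n_estimators, hyperparameters"
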
14 changes: 2 additions & 12 deletions ragger_duck/scraping/__init__.py
@@ -3,20 +3,10 @@
website of scikit-learn.
"""

-from ._api_doc import (
-    APIDocExtractor,
-    APINumPyDocExtractor,
-    extract_api_doc,
-    extract_api_doc_from_single_file,
-)
-from ._user_guide import (
-    UserGuideDocExtractor,
-)
+from ._api_doc import APINumPyDocExtractor
+from ._user_guide import UserGuideDocExtractor

__all__ = [
"extract_api_doc",
"extract_api_doc_from_single_file",
"APIDocExtractor",
"APINumPyDocExtractor",
"UserGuideDocExtractor",
]
169 changes: 0 additions & 169 deletions ragger_duck/scraping/_api_doc.py
@@ -4,18 +4,9 @@
import inspect
import re
import warnings
from itertools import chain
from numbers import Integral
from pathlib import Path

from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from langchain.text_splitter import RecursiveCharacterTextSplitter
from numpydoc.docscrape import NumpyDocString
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils._param_validation import Interval

from ._shared import _chunk_document, _extract_text_from_section

SKLEARN_API_URL = "https://scikit-learn.org/stable/modules/generated/"

@@ -36,68 +27,6 @@ def _api_path_to_api_url(path):
return SKLEARN_API_URL + path.name


def extract_api_doc_from_single_file(api_html_file):
"""Extract the text from the API documentation.
This function can process classes and functions.
Parameters
----------
api_html_file : :class:`pathlib.Path`
The path to the HTML API documentation.
Returns
-------
str
The text extracted from the API documentation.
"""
if not isinstance(api_html_file, Path):
raise ValueError(
f"The API HTML file should be a pathlib.Path object. Got {api_html_file!r}."
)
if api_html_file.suffix != ".html":
raise ValueError(
f"The file {api_html_file} is not an HTML file. Please provide an HTML "
"file."
)
with open(api_html_file, "r") as file:
soup = BeautifulSoup(file, "html.parser")
api_section = soup.section
return {
"source": _api_path_to_api_url(api_html_file),
"text": _extract_text_from_section(api_section),
}


def extract_api_doc(api_doc_folder, *, n_jobs=None):
"""Extract text from each HTML API file from a folder
Parameters
----------
api_doc_folder : :class:`pathlib.Path`
The path to the API documentation folder.
n_jobs : int, default=None
The number of jobs to run in parallel. If None, then the number of jobs is set
to the number of CPU cores.
Returns
-------
list
A list of dictionaries containing the source and text of the API
documentation.
"""
if not isinstance(api_doc_folder, Path):
raise ValueError(
"The API documentation folder should be a pathlib.Path object. Got "
f"{api_doc_folder!r}."
)
return Parallel(n_jobs=n_jobs)(
delayed(extract_api_doc_from_single_file)(api_html_file)
for api_html_file in api_doc_folder.glob("*.html")
)


def _extract_function_doc_numpydoc(function, import_name, html_source):
"""Extract documentation from a function using `numpydoc`.
@@ -240,104 +169,6 @@ def _extract_function_doc_numpydoc(function, import_name, html_source):
return extracted_doc


class APIDocExtractor(BaseEstimator, TransformerMixin):
"""Extract text from the API documentation.
This function can process classes and functions.
Parameters
----------
chunk_size : int or None, default=300
The size of the chunks to split the text into. If None, the text is not chunked.
chunk_overlap : int, default=50
The overlap between two consecutive chunks.
n_jobs : int, default=None
The number of jobs to run in parallel. If None, then the number of jobs is set
to the number of CPU cores.
Attributes
----------
text_splitter_ : :class:`langchain.text_splitter.RecursiveCharacterTextSplitter`
The text splitter to use to chunk the document. If `chunk_size` is None, this
attribute is None.
"""

_parameter_constraints = {
"chunk_size": [Interval(Integral, left=1, right=None, closed="left"), None],
"chunk_overlap": [Interval(Integral, left=0, right=None, closed="left")],
"n_jobs": [Integral, None],
}

def __init__(self, *, chunk_size=300, chunk_overlap=50, n_jobs=None):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.n_jobs = n_jobs

def fit(self, X=None, y=None):
"""No-op operation, only validate parameters.
Parameters
----------
X : None
This parameter is ignored.
y : None
This parameter is ignored.
Returns
-------
self
The fitted estimator.
"""
self._validate_params()
if self.chunk_size is not None:
self.text_splitter_ = RecursiveCharacterTextSplitter(
separators=["\n\n", "\n", " ", ""],
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
)
else:
self.text_splitter_ = None
return self

def transform(self, X):
"""Extract text from the API documentation.
Parameters
----------
X : :class:`pathlib.Path`
The path to the API documentation folder.
Returns
-------
output : list
A list of dictionaries containing the source and text of the API
documentation.
"""
if self.chunk_size is None:
output = extract_api_doc(X, n_jobs=self.n_jobs)
else:
output = list(
chain.from_iterable(
Parallel(n_jobs=self.n_jobs, return_as="generator")(
delayed(_chunk_document)(self.text_splitter_, document)
for document in extract_api_doc(X, n_jobs=self.n_jobs)
)
)
)
if not output:
raise ValueError(
"No API documentation was extracted. Please check the input folder."
)
return output

def _more_tags(self):
return {"X_types": ["string"], "stateless": True}


class APINumPyDocExtractor(BaseEstimator, TransformerMixin):
"""Extract text from the API documentation using `numpydoc`.
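
The retained :class:`APINumPyDocExtractor` builds its chunks from
`numpydoc`-parsed docstrings. A small illustration of what
`numpydoc.docscrape.NumpyDocString` exposes; the per-parameter chunk wording
below is an assumption, not necessarily what the extractor produces.

from numpydoc.docscrape import NumpyDocString
from sklearn.linear_model import LogisticRegression

doc = NumpyDocString(LogisticRegression.__doc__)
for param in doc["Parameters"]:
    # Each entry carries the parameter name, its type and its description,
    # which can be turned into a self-contained chunk referring to the class.
    chunk = (
        f"Parameter {param.name} of sklearn.linear_model.LogisticRegression: "
        + " ".join(param.desc)
    )
    print(chunk[:100])
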
43 changes: 0 additions & 43 deletions ragger_duck/scraping/_shared.py
@@ -1,46 +1,3 @@
import re

from bs4 import NavigableString


def _extract_text_from_section(section):
"""Extract the text from an HTML section.
Parameters
----------
section : :class:`bs4.element.Tag`
The HTML section from which to extract the text.
Returns
-------
str or None
The text extracted from the section. Return None if the section is a
:class:`bs4.NavigableString`.
Notes
-----
This function was copied from:
https://github.com/ray-project/llm-applications/blob/main/rag/data.py
(under CC BY 4.0 license)
"""
if isinstance(section, NavigableString):
return None
texts = []
for elem in section.children:
if isinstance(elem, NavigableString):
text = elem.strip()
else:
text = elem.get_text(" ")
# Remove line breaks within a paragraph
newline = re.compile(r"\n+")
text = newline.sub(" ", text)
# Remove the duplicated spaces on the fly
multiple_spaces = re.compile(r"\s+")
text = multiple_spaces.sub(" ", text)
texts.append(text)
return " ".join(texts).replace("¶", "\n")


def _chunk_document(text_splitter, document):
"""Chunk a document into smaller pieces.
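
For reference, `_chunk_document` (still present in the module, collapsed above)
pairs a text splitter with a document dictionary. A minimal sketch of that
pattern, assuming the `{"source", "text"}` layout used elsewhere in this
module; the sample text and URL are placeholders.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
)

document = {
    "source": "https://example.org/page.html",  # placeholder
    "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 50,
}
# Split the text and keep one dictionary per chunk, preserving the source.
chunks = [
    {"source": document["source"], "text": chunk}
    for chunk in text_splitter.split_text(document["text"])
]
print(len(chunks), chunks[0]["text"][:60])
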
(The remaining changed files were not loaded in this view.)
