iter
glemaitre committed Dec 28, 2023
1 parent ffba4d9 commit d888f7d
Showing 11 changed files with 56 additions and 391 deletions.
4 changes: 2 additions & 2 deletions app/configuration/default.py
@@ -1,7 +1,7 @@
# Retriever parameters
-SEMANTIC_RETRIEVER_PATH = "../models/api_semantic_retrieval.joblib"
+SEMANTIC_RETRIEVER_PATH = "../models/user_guide_semantic_retrieval.joblib"
SEMANTIC_TOP_K = 5
-LEXICAL_RETRIEVER_PATH = "../models/api_lexical_retrieval.joblib"
+LEXICAL_RETRIEVER_PATH = "../models/user_guide_lexical_retrieval.joblib"
LEXICAL_TOP_K = 5
CROSS_ENCODER_PATH = "cross-encoder/ms-marco-MiniLM-L-6-v2"
CROSS_ENCODER_THRESHOLD = 2.0
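
For context, a minimal sketch of how the configuration above could be wired
together at start-up. The retriever objects' `query(text, top_k)` method, the
candidate dictionary layout, and the import path are assumptions for
illustration, not the actual ragger_duck interfaces.

# Hypothetical wiring of the configuration shown above.
import joblib
from sentence_transformers import CrossEncoder

from app.configuration import default as config  # import path assumed

semantic_retriever = joblib.load(config.SEMANTIC_RETRIEVER_PATH)
lexical_retriever = joblib.load(config.LEXICAL_RETRIEVER_PATH)
reranker = CrossEncoder(config.CROSS_ENCODER_PATH)


def retrieve(query):
    # Gather candidates from both retrievers (query API assumed).
    candidates = semantic_retriever.query(query, top_k=config.SEMANTIC_TOP_K)
    candidates += lexical_retriever.query(query, top_k=config.LEXICAL_TOP_K)
    # Rerank (query, document) pairs with the cross-encoder and keep the
    # candidates scoring above the configured threshold.
    scores = reranker.predict([(query, doc["text"]) for doc in candidates])
    return [
        doc
        for doc, score in zip(candidates, scores)
        if score >= config.CROSS_ENCODER_THRESHOLD
    ]
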
2 changes: 1 addition & 1 deletion doc/references/scraping.rst
@@ -13,5 +13,5 @@ Scraping the documentation
:toctree: generated/
:template: class.rst

-APIDocExtractor
APINumPyDocExtractor
+UserGuideDocExtractor
18 changes: 10 additions & 8 deletions doc/user_guide.rst
@@ -12,10 +12,8 @@ Scraping
The scraping module provides simple estimators that extract meaningful
documentation from the documentation website.

-:class:`~ragger_duck.scraping.APIDocExtractor` is a scraper that loads the
-HTML pages and extract the documentation from the main API section. One can
-provide a `chunk_size` and `chunk_overlap` to further split the documentation
-sections into smaller chunks.
+API documentation
+-----------------

:class:`~ragger_duck.scraping.APINumPyDocExtractor` is a more advanced scraper
that uses `numpydoc` and its scraper to extract the documentation. Indeed, the
@@ -24,10 +22,14 @@ chunks of documentation from the parsed sections. While we don't control for
the chunk size, the chunks are built such that they contain information only
about a specific parameter and always refer to the class or function. We hope
that scraping in such a way can remove ambiguity that could exist when building chunks
-without any control. Since we rely on `numpydoc` for the parsing that expect
-a specific formatting, then
-:class:`~ragger_duck.scraping.APINumPyDocExtractor` is much faster than
-:class:`~ragger_duck.scraping.APIDocExtractor`.
+without any control.

+User Guide documentation
+------------------------

+:class:`~ragger_duck.scraping.UserGuideDocExtractor` is a scraper that extracts
+documentation from the user guide. It is a simple scraper that extracts
+text information from the webpage. Additionally, this text can be chunked.

Retriever
=========
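
To make the scraping documentation above concrete, a minimal usage sketch. It
assumes both extractors follow the scikit-learn transformer API (stateless
`fit`/`transform` over a folder of built HTML pages, returning dictionaries
with `source` and `text` keys); the folder paths and the
`UserGuideDocExtractor` constructor arguments are placeholders.

from pathlib import Path

from ragger_duck.scraping import APINumPyDocExtractor, UserGuideDocExtractor

# API documentation: chunks are driven by numpydoc parsing, one parameter at
# a time, so there is no chunk-size knob to tune.
api_chunks = APINumPyDocExtractor().fit_transform(
    Path("scikit-learn/doc/_build/html/modules/generated")  # placeholder
)

# User guide: plain text extraction from the web pages, optionally chunked.
user_guide_chunks = UserGuideDocExtractor(
    chunk_size=300, chunk_overlap=50  # assumed parameters
).fit_transform(Path("scikit-learn/doc/_build/html"))  # placeholder

print(api_chunks[0]["source"], api_chunks[0]["text"][:80])
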
7 changes: 3 additions & 4 deletions ragger_duck/prompt/_api.py
@@ -50,16 +50,15 @@ def __call__(self, query, **prompt_kwargs):
" queries"
)
prompt = (
"[INST] Rephrase the query to have correct wording in a context of "
"machine-learning. Make sure to have the right spelling. Finally, only "
"provide a list of keywords separated by commas.\n\n"
"[INST] Extract a list of keywords from the query below for a context"
" of machine-learning using scikit-learn.\n\n"
f"query: {query}[/INST]"
)

# do not create a stream generator
local_prompt_kwargs = prompt_kwargs.copy()
local_prompt_kwargs["stream"] = False
logger.info("Prompting to get keywords from the query")
logger.info(f"Prompting to get keywords from the query:\n{prompt}")
response = self.llm(prompt, **local_prompt_kwargs)
keywords = response["choices"][0]["text"].strip()
logger.info(f"Keywords: {keywords}")
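
The prompt change above swaps a query-rephrasing instruction for a
keyword-extraction instruction. A minimal sketch of the call pattern, assuming
a llama-cpp-python style callable (suggested by the
`response["choices"][0]["text"]` access); the model path is a placeholder and
the printed keywords are only an example.

from llama_cpp import Llama

llm = Llama(model_path="path/to/mistral-7b-instruct.gguf")  # placeholder

query = "how do i pick n_estimators for random forests?"
prompt = (
    "[INST] Extract a list of keywords from the query below for a context"
    " of machine-learning using scikit-learn.\n\n"
    f"query: {query}[/INST]"
)
# Ask for a single, non-streamed completion, as in the code above.
response = llm(prompt, stream=False)
keywords = response["choices"][0]["text"].strip()
print(keywords)  # e.g. "random forest, n_estimators, hyperparameters"
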
14 changes: 2 additions & 12 deletions ragger_duck/scraping/__init__.py
@@ -3,20 +3,10 @@
website of scikit-learn.
"""

-from ._api_doc import (
-    APIDocExtractor,
-    APINumPyDocExtractor,
-    extract_api_doc,
-    extract_api_doc_from_single_file,
-)
-from ._user_guide import (
-    UserGuideDocExtractor,
-)
+from ._api_doc import APINumPyDocExtractor
+from ._user_guide import UserGuideDocExtractor

__all__ = [
"extract_api_doc",
"extract_api_doc_from_single_file",
"APIDocExtractor",
"APINumPyDocExtractor",
"UserGuideDocExtractor",
]
169 changes: 0 additions & 169 deletions ragger_duck/scraping/_api_doc.py
@@ -4,18 +4,9 @@
import inspect
import re
import warnings
from itertools import chain
from numbers import Integral
from pathlib import Path

from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from langchain.text_splitter import RecursiveCharacterTextSplitter
from numpydoc.docscrape import NumpyDocString
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils._param_validation import Interval

from ._shared import _chunk_document, _extract_text_from_section

SKLEARN_API_URL = "https://scikit-learn.org/stable/modules/generated/"

@@ -36,68 +27,6 @@ def _api_path_to_api_url(path):
return SKLEARN_API_URL + path.name


def extract_api_doc_from_single_file(api_html_file):
"""Extract the text from the API documentation.
This function can process classes and functions.
Parameters
----------
api_html_file : :class:`pathlib.Path`
The path to the HTML API documentation.
Returns
-------
str
The text extracted from the API documentation.
"""
if not isinstance(api_html_file, Path):
raise ValueError(
f"The API HTML file should be a pathlib.Path object. Got {api_html_file!r}."
)
if api_html_file.suffix != ".html":
raise ValueError(
f"The file {api_html_file} is not an HTML file. Please provide an HTML "
"file."
)
with open(api_html_file, "r") as file:
soup = BeautifulSoup(file, "html.parser")
api_section = soup.section
return {
"source": _api_path_to_api_url(api_html_file),
"text": _extract_text_from_section(api_section),
}


def extract_api_doc(api_doc_folder, *, n_jobs=None):
"""Extract text from each HTML API file from a folder
Parameters
----------
api_doc_folder : :class:`pathlib.Path`
The path to the API documentation folder.
n_jobs : int, default=None
The number of jobs to run in parallel. If None, then the number of jobs is set
to the number of CPU cores.
Returns
-------
list
A list of dictionaries containing the source and text of the API
documentation.
"""
if not isinstance(api_doc_folder, Path):
raise ValueError(
"The API documentation folder should be a pathlib.Path object. Got "
f"{api_doc_folder!r}."
)
return Parallel(n_jobs=n_jobs)(
delayed(extract_api_doc_from_single_file)(api_html_file)
for api_html_file in api_doc_folder.glob("*.html")
)


def _extract_function_doc_numpydoc(function, import_name, html_source):
"""Extract documentation from a function using `numpydoc`.
@@ -240,104 +169,6 @@ def _extract_function_doc_numpydoc(function, import_name, html_source):
return extracted_doc


class APIDocExtractor(BaseEstimator, TransformerMixin):
"""Extract text from the API documentation.
This function can process classes and functions.
Parameters
----------
chunk_size : int or None, default=300
The size of the chunks to split the text into. If None, the text is not chunked.
chunk_overlap : int, default=50
The overlap between two consecutive chunks.
n_jobs : int, default=None
The number of jobs to run in parallel. If None, then the number of jobs is set
to the number of CPU cores.
Attributes
----------
text_splitter_ : :class:`langchain.text_splitter.RecursiveCharacterTextSplitter`
The text splitter to use to chunk the document. If `chunk_size` is None, this
attribute is None.
"""

_parameter_constraints = {
"chunk_size": [Interval(Integral, left=1, right=None, closed="left"), None],
"chunk_overlap": [Interval(Integral, left=0, right=None, closed="left")],
"n_jobs": [Integral, None],
}

def __init__(self, *, chunk_size=300, chunk_overlap=50, n_jobs=None):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.n_jobs = n_jobs

def fit(self, X=None, y=None):
"""No-op operation, only validate parameters.
Parameters
----------
X : None
This parameter is ignored.
y : None
This parameter is ignored.
Returns
-------
self
The fitted estimator.
"""
self._validate_params()
if self.chunk_size is not None:
self.text_splitter_ = RecursiveCharacterTextSplitter(
separators=["\n\n", "\n", " ", ""],
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
)
else:
self.text_splitter_ = None
return self

def transform(self, X):
"""Extract text from the API documentation.
Parameters
----------
X : :class:`pathlib.Path`
The path to the API documentation folder.
Returns
-------
output : list
A list of dictionaries containing the source and text of the API
documentation.
"""
if self.chunk_size is None:
output = extract_api_doc(X, n_jobs=self.n_jobs)
else:
output = list(
chain.from_iterable(
Parallel(n_jobs=self.n_jobs, return_as="generator")(
delayed(_chunk_document)(self.text_splitter_, document)
for document in extract_api_doc(X, n_jobs=self.n_jobs)
)
)
)
if not output:
raise ValueError(
"No API documentation was extracted. Please check the input folder."
)
return output

def _more_tags(self):
return {"X_types": ["string"], "stateless": True}


class APINumPyDocExtractor(BaseEstimator, TransformerMixin):
"""Extract text from the API documentation using `numpydoc`.
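
The retained :class:`APINumPyDocExtractor` builds its chunks from
`numpydoc`-parsed docstrings. A small illustration of what
`numpydoc.docscrape.NumpyDocString` exposes; the per-parameter chunk wording
below is an assumption, not necessarily what the extractor produces.

from numpydoc.docscrape import NumpyDocString
from sklearn.linear_model import LogisticRegression

doc = NumpyDocString(LogisticRegression.__doc__)
for param in doc["Parameters"]:
    # Each entry carries the parameter name, its type and its description,
    # which can be turned into a self-contained chunk referring to the class.
    chunk = (
        f"Parameter {param.name} of sklearn.linear_model.LogisticRegression: "
        + " ".join(param.desc)
    )
    print(chunk[:100])
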
43 changes: 0 additions & 43 deletions ragger_duck/scraping/_shared.py
@@ -1,46 +1,3 @@
import re

from bs4 import NavigableString


def _extract_text_from_section(section):
"""Extract the text from an HTML section.
Parameters
----------
section : :class:`bs4.element.Tag`
The HTML section from which to extract the text.
Returns
-------
str or None
The text extracted from the section. Return None if the section is a
:class:`bs4.NavigableString`.
Notes
-----
This function was copied from:
https://github.com/ray-project/llm-applications/blob/main/rag/data.py
(under CC BY 4.0 license)
"""
if isinstance(section, NavigableString):
return None
texts = []
for elem in section.children:
if isinstance(elem, NavigableString):
text = elem.strip()
else:
text = elem.get_text(" ")
# Remove line breaks within a paragraph
newline = re.compile(r"\n+")
text = newline.sub(" ", text)
# Remove the duplicated spaces on the fly
multiple_spaces = re.compile(r"\s+")
text = multiple_spaces.sub(" ", text)
texts.append(text)
return " ".join(texts).replace("¶", "\n")


def _chunk_document(text_splitter, document):
"""Chunk a document into smaller pieces.
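
For reference, `_chunk_document` (still present in the module, collapsed above)
pairs a text splitter with a document dictionary. A minimal sketch of that
pattern, assuming the `{"source", "text"}` layout used elsewhere in this
module; the sample text and URL are placeholders.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
)

document = {
    "source": "https://example.org/page.html",  # placeholder
    "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 50,
}
# Split the text and keep one dictionary per chunk, preserving the source.
chunks = [
    {"source": document["source"], "text": chunk}
    for chunk in text_splitter.split_text(document["text"])
]
print(len(chunks), chunks[0]["text"][:60])
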
(The remaining changed files were not loaded in this view.)
