From 53f502a3588038f4e3ecb7b38a02035eefbac0e4 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 28 Jun 2024 18:30:47 +0200
Subject: [PATCH] DOC add more details regarding the API documentation scraper

---
 examples/plot_documentation_scraping.py | 28 +++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/examples/plot_documentation_scraping.py b/examples/plot_documentation_scraping.py
index de00fdd..c0f2660 100644
--- a/examples/plot_documentation_scraping.py
+++ b/examples/plot_documentation_scraping.py
@@ -27,9 +27,29 @@
 chunks = APINumPyDocExtractor().fit_transform(path_api_doc)
 
 # %%
-for chunk in chunks:
-    print(f"The source of the chunk is {chunk['source']}\n")
-    print(f"{chunk['text']}\n")
+# The chunks are stored in a list of dictionaries.
+print(f"Chunks is {type(chunks)}")
+print(f"A chunk is {type(chunks[0])}")
 
 # %%
-print("hello world")
+# A chunk contains 2 keys: `"source"` that is the HTML source page and `"text"` that is
+# the extracted text.
+chunks[0].keys()
+
+# %%
+# For the API documentation, we use `numpydoc` to generate meaningful chunks. For
+# instance, this is the first chunk of text.
+print(chunks[0]["text"])
+
+# %%
+# The first line of the chunk corresponds to the estimator or class name and its
+# module. This information is useful to disambiguate the documentation when using an
+# LLM: sometimes we can have multiple parameter names defined in different classes or
+# functions. An LLM will tend to summarize the information coming from the different
+# chunks. However, if we provide the class or function name and this information is
+# present in the user prompt, then the LLM is likely to generate a more accurate
+# answer.
+#
+# Since `numpydoc` offers structured information based on the sections of the
+# docstring, we therefore use these sections and create hand-crafted chunks that we
+# find meaningful with regard to the API documentation.