iter

probabl-ai · Dec 15, 2023 · f142a44 · f142a44
1 parent d649679
commit f142a44
Show file tree

Hide file tree

Showing 51 changed files with 80 additions and 110 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -21,4 +21,4 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt
       - name: Run tests
-        run: pytest -vsl --cov=rag_based_llm --cov-report term-missing rag_based_llm
+        run: pytest -vsl --cov=rag --cov-report term-missing rag
diff --git a/README.md b/README.md
@@ -1,32 +1,3 @@
-# RAG-based LLM for scikit-learn documentation
+# RAG for scikit-learn documentation
 
-This is a all components to build a RAG-based LLM for the scikit-learn documentation.
-
-## Installation
-
-```bash
-pip install -r requirements.txt
-```
-
-## Starting the server
-
-```bash
-cd app
-make start
-```
-
-The server can be access to:
-
-```bash
-http://localhost:8123
-```
-
-## RAG-based LLM
-
-We can represent a RAG-based LLM as follow [1]:
-
-![RAG diagram](doc/img/static/rag_pipeline.png)
-
-## References
-
-[1] https://vinija.ai/nlp/RAG/
+This repository contains all the components to build a RAG for the scikit-learn documentation.
diff --git a/app/main.py b/app/main.py
@@ -18,8 +18,8 @@
 from sentence_transformers import CrossEncoder
 
 sys.path.append(str(Path(__file__).parent.parent))
-from rag_based_llm.prompt import QueryAgent
-from rag_based_llm.retrieval import RetrieverReranker
+from rag.prompt import QueryAgent
+from rag.retrieval import RetrieverReranker
 
 DEFAULT_PORT = 8123
 

diff --git a/doc/conf.py b/doc/conf.py
@@ -11,7 +11,7 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
-project = "rag_based_llm"
+project = "rag"
 copyright = "2023, G. Lemaitre"
 author = "G. Lemaitre"
 release = "0.0.1.dev0"
@@ -40,10 +40,10 @@
 
 html_theme = "pydata_sphinx_theme"
 html_static_path = ["_static"]
-html_style = "css/rag_based_llm.css"
+html_style = "css/rag.css"
 html_logo = "_static/img/logo.png"
 html_css_files = [
-    "css/rag_based_llm.css",
+    "css/rag.css",
 ]
 html_sidebars = {
     "changelog": [],

diff --git a/doc/index.rst b/doc/index.rst
@@ -28,7 +28,7 @@ about the scikit-learn library.
                 <div class="card-body flex-fill">
                     <i class="fas fa-download fa-7x"></i>
                     <h5 class="card-title">Getting started</h5>
-                    <p class="card-text">Check out the getting started guides to install <em>rag_based_llm</em>.
+                    <p class="card-text">Check out the getting started guides to install <em>rag</em>.
                     Some extra information to get started with a new contribution is also provided.</p>
 
 .. container:: custom-button

diff --git a/doc/install.rst b/doc/install.rst
@@ -9,7 +9,7 @@ package. An easy and dirty way is to add the package into your path for the mome
 
   import sys
 
-  path_to_package = "/path/to/rag_based_llm"
+  path_to_package = "/path/to/rag"
   sys.path.append(path_to_package)
 
 You can check the file `requirements.txt` for the required packages. We also provide

diff --git a/doc/references/embedding.rst b/doc/references/embedding.rst
@@ -3,11 +3,11 @@
 Embedding
 =========
 
-.. automodule:: rag_based_llm.embedding
+.. automodule:: rag.embedding
    :no-members:
    :no-inherited-members:
 
-.. currentmodule:: rag_based_llm.embedding
+.. currentmodule:: rag.embedding
 
 .. autosummary::
    :toctree: generated/

diff --git a/doc/references/index.rst b/doc/references/index.rst
@@ -4,7 +4,7 @@
 API reference
 #############
 
-This is the full API documentation of the `rag_based_llm` package.
+This is the full API documentation of the `rag` package.
 
 .. toctree::
    :maxdepth: 2

diff --git a/doc/references/prompt.rst b/doc/references/prompt.rst
@@ -3,11 +3,11 @@
 Prompt
 ======
 
-.. automodule:: rag_based_llm.prompt
+.. automodule:: rag.prompt
    :no-members:
    :no-inherited-members:
 
-.. currentmodule:: rag_based_llm.prompt
+.. currentmodule:: rag.prompt
 
 .. autosummary::
    :toctree: generated/

diff --git a/doc/references/retrieval.rst b/doc/references/retrieval.rst
@@ -3,11 +3,11 @@
 Retrieval
 =========
 
-.. automodule:: rag_based_llm.retrieval
+.. automodule:: rag.retrieval
    :no-members:
    :no-inherited-members:
 
-.. currentmodule:: rag_based_llm.retrieval
+.. currentmodule:: rag.retrieval
 
 .. autosummary::
    :toctree: generated/

diff --git a/doc/references/scraping.rst b/doc/references/scraping.rst
@@ -3,14 +3,14 @@
 Scraping the documentation
 ==========================
 
-.. automodule:: rag_based_llm.scraping
+.. automodule:: rag.scraping
    :no-members:
    :no-inherited-members:
 
-.. currentmodule:: rag_based_llm.scraping
+.. currentmodule:: rag.scraping
 
 .. autosummary::
    :toctree: generated/
    :template: class.rst
 
-   APIDocExtractor
+   APIDocExtractor
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -1,4 +1,4 @@
-.. currentmodule:: rag_based_llm
+.. currentmodule:: rag
 
 ===============
 Release history

diff --git a/rag/__init__.py b/rag/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.1.dev0"
diff --git a/rag/embedding/__init__.py b/rag/embedding/__init__.py
@@ -0,0 +1,7 @@
+"""The module :mod:`rag.embedding` contains functions to embed
+text using transformers.
+"""
+
+from ._sentence_transformer import SentenceTransformer
+
+__all__ = ["SentenceTransformer"]
diff --git a/...ed_llm/embedding/_sentence_transformer.py → rag/embedding/_sentence_transformer.py b/...ed_llm/embedding/_sentence_transformer.py → rag/embedding/_sentence_transformer.py
diff --git a/rag_based_llm/embedding/tests/__init__.py → rag/embedding/tests/__init__.py b/rag_based_llm/embedding/tests/__init__.py → rag/embedding/tests/__init__.py
diff --git a/...ased_llm/embedding/tests/data/__init__.py → rag/embedding/tests/data/__init__.py b/...ased_llm/embedding/tests/data/__init__.py → rag/embedding/tests/data/__init__.py
diff --git a/...ase-albert-small-v2/1_Pooling/config.json → ...ase-albert-small-v2/1_Pooling/config.json b/...ase-albert-small-v2/1_Pooling/config.json → ...ase-albert-small-v2/1_Pooling/config.json
@@ -4,4 +4,4 @@
   "pooling_mode_mean_tokens": true,
   "pooling_mode_max_tokens": false,
   "pooling_mode_mean_sqrt_len_tokens": false
-}
+}
diff --git a/...mers_paraphrase-albert-small-v2/README.md → ...mers_paraphrase-albert-small-v2/README.md b/...mers_paraphrase-albert-small-v2/README.md → ...mers_paraphrase-albert-small-v2/README.md
@@ -99,17 +99,17 @@ For an automated evaluation of this model, see the *Sentence Embeddings Benchmar
 ## Full Model Architecture
 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length': 100, 'do_lower_case': False}) with Transformer model: AlbertModel 
+  (0): Transformer({'max_seq_length': 100, 'do_lower_case': False}) with Transformer model: AlbertModel
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
 )
 ```
 
 ## Citing & Authors
 
-This model was trained by [sentence-transformers](https://www.sbert.net/). 
-        
+This model was trained by [sentence-transformers](https://www.sbert.net/).
+
 If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):
-```bibtex 
+```bibtex
 @inproceedings{reimers-2019-sentence-bert,
     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
     author = "Reimers, Nils and Gurevych, Iryna",
@@ -119,4 +119,4 @@ If you find this model helpful, feel free to cite our publication [Sentence-BERT
     publisher = "Association for Computational Linguistics",
     url = "http://arxiv.org/abs/1908.10084",
 }
-```
+```
diff --git a/...rs_paraphrase-albert-small-v2/config.json → ...rs_paraphrase-albert-small-v2/config.json b/...rs_paraphrase-albert-small-v2/config.json → ...rs_paraphrase-albert-small-v2/config.json
diff --git a/...mall-v2/config_sentence_transformers.json → ...mall-v2/config_sentence_transformers.json b/...mall-v2/config_sentence_transformers.json → ...mall-v2/config_sentence_transformers.json
@@ -4,4 +4,4 @@
     "transformers": "4.7.0",
     "pytorch": "1.9.0+cu102"
   }
-}
+}
diff --git a/...s_paraphrase-albert-small-v2/modules.json → ...s_paraphrase-albert-small-v2/modules.json b/...s_paraphrase-albert-small-v2/modules.json → ...s_paraphrase-albert-small-v2/modules.json
@@ -11,4 +11,4 @@
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
   }
-]
+]
diff --git a/...aphrase-albert-small-v2/pytorch_model.bin → ...aphrase-albert-small-v2/pytorch_model.bin b/...aphrase-albert-small-v2/pytorch_model.bin → ...aphrase-albert-small-v2/pytorch_model.bin
diff --git a/...albert-small-v2/sentence_bert_config.json → ...albert-small-v2/sentence_bert_config.json b/...albert-small-v2/sentence_bert_config.json → ...albert-small-v2/sentence_bert_config.json
@@ -1,4 +1,4 @@
 {
   "max_seq_length": 100,
   "do_lower_case": false
-}
+}
diff --git a/...e-albert-small-v2/special_tokens_map.json → ...e-albert-small-v2/special_tokens_map.json b/...e-albert-small-v2/special_tokens_map.json → ...e-albert-small-v2/special_tokens_map.json
@@ -1 +1 @@
-{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
+{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
diff --git a/...s_paraphrase-albert-small-v2/spiece.model → ...s_paraphrase-albert-small-v2/spiece.model b/...s_paraphrase-albert-small-v2/spiece.model → ...s_paraphrase-albert-small-v2/spiece.model
diff --git a/...paraphrase-albert-small-v2/tokenizer.json → ...paraphrase-albert-small-v2/tokenizer.json b/...paraphrase-albert-small-v2/tokenizer.json → ...paraphrase-albert-small-v2/tokenizer.json
diff --git a/...ase-albert-small-v2/tokenizer_config.json → ...ase-albert-small-v2/tokenizer_config.json b/...ase-albert-small-v2/tokenizer_config.json → ...ase-albert-small-v2/tokenizer_config.json
@@ -1 +1 @@
-{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "old_models/paraphrase-albert-small-v2/0_Transformer"}
+{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "old_models/paraphrase-albert-small-v2/0_Transformer"}
diff --git a/...edding/tests/test_sentence_transformer.py → ...edding/tests/test_sentence_transformer.py b/...edding/tests/test_sentence_transformer.py → ...edding/tests/test_sentence_transformer.py
@@ -3,11 +3,12 @@
 
 import pytest
 
-from rag_based_llm.embedding import SentenceTransformer
+from rag.embedding import SentenceTransformer
 
 
 @pytest.mark.parametrize(
-    "input_texts", [
+    "input_texts",
+    [
         [
             {"source": "source 1", "text": "hello world"},
             {"source": "source 2", "text": "hello world"},

diff --git a/rag_based_llm/prompt/__init__.py → rag/prompt/__init__.py b/rag_based_llm/prompt/__init__.py → rag/prompt/__init__.py
@@ -1,3 +1,3 @@
 from ._agent import QueryAgent
 
-__all__ = ["QueryAgent"]
+__all__ = ["QueryAgent"]
diff --git a/rag_based_llm/prompt/_agent.py → rag/prompt/_agent.py b/rag_based_llm/prompt/_agent.py → rag/prompt/_agent.py
diff --git a/rag_based_llm/prompt/tests/__init__.py → rag/prompt/tests/__init__.py b/rag_based_llm/prompt/tests/__init__.py → rag/prompt/tests/__init__.py
diff --git a/rag_based_llm/retrieval/__init__.py → rag/retrieval/__init__.py b/rag_based_llm/retrieval/__init__.py → rag/retrieval/__init__.py
@@ -1,5 +1,5 @@
 from ._lexical import BM25Retriever
-from ._semantic import SemanticRetriever
 from ._reranking import RetrieverReranker
+from ._semantic import SemanticRetriever
 
 __all__ = ["BM25Retriever", "RetrieverReranker", "SemanticRetriever"]
diff --git a/rag_based_llm/retrieval/_lexical.py → rag/retrieval/_lexical.py b/rag_based_llm/retrieval/_lexical.py → rag/retrieval/_lexical.py
@@ -109,7 +109,7 @@ def query(self, query):
             )
         )
         scores = (idf * numerator / denominator).sum(axis=1)
-        indices = scores.argsort()[::-1][:self.top_k]
+        indices = scores.argsort()[::-1][: self.top_k]
         if isinstance(self.X_fit_[0], dict):
             return [
                 {

diff --git a/rag_based_llm/retrieval/_reranking.py → rag/retrieval/_reranking.py b/rag_based_llm/retrieval/_reranking.py → rag/retrieval/_reranking.py
diff --git a/rag_based_llm/retrieval/_semantic.py → rag/retrieval/_semantic.py b/rag_based_llm/retrieval/_semantic.py → rag/retrieval/_semantic.py
diff --git a/rag_based_llm/retrieval/tests/__init__.py → rag/retrieval/tests/__init__.py b/rag_based_llm/retrieval/tests/__init__.py → rag/retrieval/tests/__init__.py
diff --git a/...based_llm/retrieval/tests/test_lexical.py → rag/retrieval/tests/test_lexical.py b/...based_llm/retrieval/tests/test_lexical.py → rag/retrieval/tests/test_lexical.py
@@ -1,7 +1,7 @@
 import pytest
 from sklearn.feature_extraction.text import CountVectorizer
 
-from rag_based_llm.retrieval import BM25Retriever
+from rag.retrieval import BM25Retriever
 
 
 @pytest.mark.parametrize(
@@ -20,9 +20,7 @@
 @pytest.mark.parametrize("count_vectorizer", [None, CountVectorizer()])
 def test_lexical_retriever(input_texts, output, count_vectorizer):
     """Check that the SemanticRetriever wrapper works as expected"""
-    bm25 = BM25Retriever(count_vectorizer=count_vectorizer, top_k=1).fit(
-        input_texts
-    )
+    bm25 = BM25Retriever(count_vectorizer=count_vectorizer, top_k=1).fit(input_texts)
     assert bm25.query("xxx") == output
 
 

diff --git a/...sed_llm/retrieval/tests/test_reranking.py → rag/retrieval/tests/test_reranking.py b/...sed_llm/retrieval/tests/test_reranking.py → rag/retrieval/tests/test_reranking.py
@@ -3,8 +3,8 @@
 import pytest
 from sentence_transformers import CrossEncoder
 
-from rag_based_llm.embedding import SentenceTransformer
-from rag_based_llm.retrieval import BM25Retriever, RetrieverReranker, SemanticRetriever
+from rag.embedding import SentenceTransformer
+from rag.retrieval import BM25Retriever, RetrieverReranker, SemanticRetriever
 
 
 @pytest.mark.parametrize(

diff --git a/...ased_llm/retrieval/tests/test_semantic.py → rag/retrieval/tests/test_semantic.py b/...ased_llm/retrieval/tests/test_semantic.py → rag/retrieval/tests/test_semantic.py
@@ -2,8 +2,8 @@
 
 import pytest
 
-from rag_based_llm.embedding import SentenceTransformer
-from rag_based_llm.retrieval import SemanticRetriever
+from rag.embedding import SentenceTransformer
+from rag.retrieval import SemanticRetriever
 
 
 @pytest.mark.parametrize(
@@ -52,4 +52,4 @@ def test_semantic_retriever_error():
     input_texts = [{"source": "source 1", "text": "xxx"}]
     faiss = SemanticRetriever(embedding=embedder, top_k=1).fit(input_texts)
     with pytest.raises(TypeError):
-        faiss.query(["xxxx"])
+        faiss.query(["xxxx"])
diff --git a/rag_based_llm/scraping/__init__.py → rag/scraping/__init__.py b/rag_based_llm/scraping/__init__.py → rag/scraping/__init__.py
@@ -1,15 +1,15 @@
-"""The module :mod:`rag_based_llm.scraping` contains functions to scrape
+"""The module :mod:`rag.scraping` contains functions to scrape
 the documentation website of scikit-learn.
 """
 
 from ._api_doc import (
+    APIDocExtractor,
     extract_api_doc,
     extract_api_doc_from_single_file,
-    APIDocExtractor,
 )
 
 __all__ = [
     "extract_api_doc",
     "extract_api_doc_from_single_file",
     "APIDocExtractor",
-]
+]
diff --git a/rag_based_llm/scraping/_api_doc.py → rag/scraping/_api_doc.py b/rag_based_llm/scraping/_api_doc.py → rag/scraping/_api_doc.py
diff --git a/rag_based_llm/scraping/tests/__init__.py → rag/scraping/tests/__init__.py b/rag_based_llm/scraping/tests/__init__.py → rag/scraping/tests/__init__.py
diff --git a/...based_llm/scraping/tests/data/__init__.py → rag/scraping/tests/data/__init__.py b/...based_llm/scraping/tests/data/__init__.py → rag/scraping/tests/data/__init__.py
diff --git a/...m/scraping/tests/data/api_doc/__init__.py → rag/scraping/tests/data/api_doc/__init__.py b/...m/scraping/tests/data/api_doc/__init__.py → rag/scraping/tests/data/api_doc/__init__.py
diff --git a/...a/api_doc/sklearn.base.BaseEstimator.html → ...a/api_doc/sklearn.base.BaseEstimator.html b/...a/api_doc/sklearn.base.BaseEstimator.html → ...a/api_doc/sklearn.base.BaseEstimator.html
@@ -18,14 +18,14 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
-  
+
   <title>sklearn.base.BaseEstimator &mdash; scikit-learn 1.4.dev0 documentation</title>
-  
+
   <link rel="canonical" href="http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html" />
 
-  
+
   <link rel="shortcut icon" href="../../_static/favicon.ico"/>
-  
+
 
   <link rel="stylesheet" href="../../_static/css/vendor/bootstrap.min.css" type="text/css" />
   <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
@@ -39,7 +39,7 @@
   <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
 <script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
 <script src="../../_static/js/vendor/jquery-3.6.3.slim.min.js"></script>
-<script src="../../_static/js/details-permalink.js"></script> 
+<script src="../../_static/js/details-permalink.js"></script>
 </head>
 <body>
 
@@ -192,7 +192,7 @@
     </div>
     <div id="sk-page-content-wrapper">
       <div class="sk-page-content container-fluid body px-md-3" role="main">
-        
+
   <section id="sklearn-base-baseestimator">
 <h1><a class="reference internal" href="../classes.html#module-sklearn.base" title="sklearn.base"><code class="xref py py-mod docutils literal notranslate"><span class="pre">sklearn.base</span></code></a>.BaseEstimator<a class="headerlink" href="#sklearn-base-baseestimator" title="Link to this heading">¶</a></h1>
 <dl class="py class">
@@ -360,8 +360,8 @@ <h2>Examples using <code class="docutils literal notranslate"><span class="pre">
 });
 
 </script>
-    
+
 <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
-    
+
 </body>
-</html>
+</html>