Merge pull request #54 from KruxAI/reranking-fixes
Change re-ranker selection to 1 at a time
aravind10x authored Oct 5, 2024
2 parents 112e759 + f29f0de commit 710c53a
Showing 9 changed files with 98 additions and 367 deletions.
4 changes: 2 additions & 2 deletions src/ragbuilder/data_processor.py
@@ -68,7 +68,7 @@ def process_directory(self, dir_path: str) -> str:

files = [f for f in Path(dir_path).rglob('*') if f.is_file() and f.name != '.DS_Store']
args = [(str(f), f'{processed_dir}/{str(f.relative_to(dir_path))}.processed') for f in files]
print(args)
# print(args)

# Use the Pool within the main guard
with Pool() as pool:
@@ -90,7 +90,7 @@ def process_file(self, file_path: str, processed_file: str = None) -> str:
if processed_dir != '':
os.makedirs(processed_dir, exist_ok=True)

print(file_path, processed_file)
# print(file_path, processed_file)
logger.info(f"Preprocessing file: {file_path} (this may take a while)...")

# Read the original file and process it
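The two hunks above only silence ad-hoc `print` calls while keeping the `logger.info` lines. For reference, a minimal sketch of the usual alternative: routing the same detail through the module logger at DEBUG level, so it can be toggled rather than commented out. The logging setup and sample values here are assumptions for illustration, not ragbuilder's actual configuration:

```python
import logging

logging.basicConfig(level=logging.INFO)  # assumed setup; switch to DEBUG to surface details
logger = logging.getLogger(__name__)

# Instead of print(args) / print(file_path, processed_file):
args = [("a.txt", "processed/a.txt.processed")]  # illustrative stand-in
logger.debug("Preprocess args: %s", args)        # hidden at INFO, visible at DEBUG
```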
149 changes: 7 additions & 142 deletions src/ragbuilder/executor.py
@@ -11,6 +11,7 @@
import json
import openai
import optuna
import math
from optuna.storages import RDBStorage
from optuna.trial import TrialState
from tenacity import retry, stop_after_attempt, wait_random_exponential, before_sleep_log, retry_if_exception_type, retry_if_result
@@ -142,148 +143,6 @@ def get_model_obj(model_type: str, model: str, temperature: Optional[float] = None
exec(code_str, None, locals_dict)
return locals_dict[model_type]
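For context, the two lines above are the tail of ragbuilder's code-generation pattern: a model definition arrives as a code string, gets `exec`-ed into a namespace, and the resulting object is fetched back by name. A minimal sketch of that pattern with an illustrative code string (the `ChatOpenAI` snippet is an assumption for the example, not the exact string `get_model_obj` builds):

```python
from langchain_openai import ChatOpenAI

# A generated snippet binds an object to a known name ('llm' here)...
code_str = "llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)"

# ...exec() writes that binding into locals_dict, where it can be read back by name.
locals_dict = {"ChatOpenAI": ChatOpenAI}  # symbols the snippet is allowed to reference
exec(code_str, None, locals_dict)
llm = locals_dict["llm"]
```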

def rag_builder_bayes_optmization(**kwargs):
run_id=kwargs['run_id']
src_data=kwargs['src_data']
selected_templates = kwargs.get('selected_templates', [])
vectorDB=kwargs['vectorDB']
min_chunk_size=kwargs.get('min_chunk_size', 1000)
max_chunk_size=kwargs.get('max_chunk_size', 1000)
num_runs=kwargs.get('num_runs')
other_embedding=kwargs.get('other_embedding')
other_llm=kwargs.get('other_llm')
logger.info(f'other_embedding={other_embedding}')
logger.info(f'other_llm={other_llm}')
eval_framework=kwargs.get('eval_framework') # TODO: Add this as an argument to RagEvaluator
eval_embedding=kwargs.get('eval_embedding')
eval_llm=kwargs.get('eval_llm')
sota_embedding=kwargs.get('sota_embedding')
sota_llm=kwargs.get('sota_llm')
test_data=kwargs['test_data'] #loader_kwargs ={'source':'url','input_path': url1},
test_df=pd.read_csv(test_data)
test_ds = Dataset.from_pandas(test_df)
disabled_opts=kwargs['disabled_opts']
result=None
# Define the configuration space
logger.info(f"Initializing RAG parameter set...")
lc_templates.init(vectorDB, min_chunk_size, max_chunk_size, other_embedding, other_llm)
# configs_to_run=dict()
cnt_templates=0
# configs_to_run= {1:{'ragname':'simple_rag'},2:{'ragname':'semantic_chunker'},3:{'ragname':'hyde'},4:{'ragname':'hybrid_rag'},4:{'ragname':'crag'}}
#TODO: Add a check to see if the templates are to be included


if kwargs['compare_templates']:
# configs_to_run.update(top_n_templates)
cnt_templates = len(selected_templates)

if kwargs['include_granular_combos']:
space = lc_templates.generate_config_space(exclude_elements=disabled_opts)
logger.info(f"Config space={space}")
cnt_combos=lc_templates.count_combos() + cnt_templates
logger.info(f"Number of RAG combinations : {cnt_combos}")
configs_evaluated=dict()

if cnt_combos < num_runs:
total_runs=cnt_combos
else:
total_runs = num_runs + cnt_templates
else:
logger.info(f"Number of RAG combinations : {cnt_templates}")
total_runs = cnt_templates

progress_state.set_total_runs(total_runs)

# Run Templates first if templates have been selected
# configs_to_run= {1:{'ragname':'simple_rag'}}
for key in selected_templates:
val = top_n_templates[key]
progress_state.increment_progress()
logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
logger.info(f"SOTA template: {key}: {val['description']}")
# logger.info(f"Template:{key}: {val['description']}:{val['retrieval_model']}")
print(val)
val['loader_kwargs']=src_data
val['embedding_kwargs']={'embedding_model': sota_embedding}
val['llm']=sota_llm
val['run_id']=run_id
rag_builder=SOTARAGBuilder(val)
run_config=RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
# logger.info(f"{repr(run_config)}")
# time.sleep(30)
# result=0
logger.info(f"Evaluating RAG Config #{progress_state.get_progress()['current_run']}... (this may take a while)")
rageval=eval.RagEvaluator(
rag_builder, # code for rag function
test_ds,
llm = get_model_obj('llm', eval_llm),
embeddings = get_model_obj('embedding', eval_embedding),
#TODO: Fetch Run Config settings from advanced settings from front-end
run_config = run_config,
is_async=RUN_CONFIG_IS_ASYNC
)
result=rageval.evaluate()
logger.debug(f'progress_state={progress_state.get_progress()}')

if kwargs['include_granular_combos']:
# Objective function for Bayesian optimization on the custom RAG configurations
@use_named_args(space)
def objective(**params):
config = lc_templates.generate_config_from_params(params)
str_config=json.dumps(config)
score = configs_evaluated.get(str_config, None)
if score:
logger.info(f"Config already evaluated with score: {score}: {config}")
return score

config['loader_kwargs'] = src_data
config['run_id'] = run_id
# logger.info(f"Config raw={config}\n\n")
# logger.info(f"Config={json.dumps(config, indent=4)}\n\n")

progress_state.increment_progress()
logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
logger.info(f"Initializing RAG object...")
rag_builder = RagBuilder(config)
run_config = RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
# logger.info(f"{repr(run_config)}")

# Dummy run to test config structures
# scores=[0.5890422913, 0.7656478429, 0.5935820215, 0.6100727287, 0.7904418542, 0.8966577465, 0.6205320374, 0.5581511382, 0.5966923152, 0.6609632653, 0.550011964, 0.5402692061, 0.5755822793, 0.6234577837, 0.5905206211, 0.5864179955, 0.6062351971, 0.570672658, 0.7500015656, 0.7747984829, 0.7993104194, 0.781805689, 0.5710751929, 0.6645166332, 0.622714199, 0.6356301621, 0.6241188896, 0.8153687664, 0.6827077848, 0.6959527751, 0.8423843881, 0.9609655913, 0.6698080329, 0.5912493806, 0.7359742148, 0.7080427047, 0.6899119678, 0.6105474717, 0.7208188469, 0.695968622, 0.6869681458, 0.7269693914, 0.7424575424, 0.7011177759, 0.8697962711, 0.8088942748, 0.9005903531, 0.8688290896, 0.6666808804, 0.666883309, 0.6888392867, 0.7296173512, 0.6497820307, 0.9349375798, 0.6906564857, 0.7924750533, 0.8931411951, 0.9462395027, 0.881902146, 0.6423630407, 0.7474532458, 0.8388990762, 0.6705516029, 0.7747971947, 0.7218534451, 0.8823771379, 0.8505055572, 0.6567467535, 0.7043667001, 0.6939435603, 0.8808846607, 0.9005438973, 0.8691391629, 0.9763763024, 0.6278870244, 0.7355142518, 0.7633544088, 0.5913903849, 0.626892352, 0.6987860021, 0.6456495151, 0.7416265216, 0.6446452076, 0.7546382667, 0.800226133, 0.9454843785, 0.9280627528, 0.6740895569, 0.7741376011, 0.7247380601, 0.6472672733, 0.8251968841, 0.9085414624, 0.8238757897, 0.6880305725, 0.6632702383, 0.8470425157, 0.6590755791, 0.7576560761, 0.7567810953]
# try:
# score = scores[int(time.time())%100]
# except:
# score = -1
logger.info(f"Evaluating RAG Config #{progress_state.get_progress()['current_run']}... \n(this may take a while)")
rageval = eval.RagEvaluator(
rag_builder,
test_ds,
llm = get_model_obj('llm', eval_llm),
embeddings = get_model_obj('embedding', eval_embedding),
run_config=run_config,
is_async=RUN_CONFIG_IS_ASYNC
)
# x=input("Continue? ")
# if x.lower() != 'y':
# exit()
score = rageval.evaluate()
if not score:
logger.info(f"Completed evaluation. Adding to configs evaluated...")
configs_evaluated[str_config]=score
return -score # We negate the score because gp_minimize minimizes

# Run Bayesian optimization
logger.info(f"Running Bayesian optimization...")
result = gp_minimize(objective, space, n_calls=num_runs, random_state=42) #, callback=DeltaXStopper(1e-8))
logger.info(f"Completed Bayesian optimization...")

best_params = result.x
best_score = -result.fun

logger.info(f"Best Configuration: {best_params}")
logger.info(f"Best Score: {best_score}")
return 0
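The function deleted above appears to be the old scikit-optimize path (its Optuna replacement follows below). Its core trick deserves one standalone sketch: `gp_minimize` only minimizes, so a higher-is-better score is negated inside the objective and negated back at the end. `evaluate_config` below is a hypothetical stand-in for the actual RAG evaluation:

```python
from skopt import gp_minimize
from skopt.space import Categorical, Integer
from skopt.utils import use_named_args

space = [
    Integer(500, 2000, name="chunk_size"),
    Categorical(["vanilla", "hybrid"], name="retriever"),
]

def evaluate_config(params):  # hypothetical scorer, higher is better
    return 1.0 / (1 + abs(params["chunk_size"] - 1000))

@use_named_args(space)  # maps each sampled point to named kwargs
def objective(**params):
    return -evaluate_config(params)  # negate: minimizing this maximizes the score

result = gp_minimize(objective, space, n_calls=15, random_state=42)
best_params, best_score = result.x, -result.fun  # undo the negation
```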

def rag_builder_bayes_optimization_optuna(**kwargs):
run_id=kwargs['run_id']
src_data=kwargs['src_data']
@@ -424,6 +283,12 @@ def objective(trial):
result = rageval.evaluate()
logger.info(f"Completed evaluation. result={result}...")
if 'answer_correctness' in result and result['answer_correctness'] != float('NaN'):
logger.debug("Answer_correctness: ", result.scores["answer_correctness"])
none_records = len(result.scores.filter(lambda x: math.isnan(x['answer_correctness']) if x['answer_correctness'] is not None else False))
percent_none = (none_records * 1.0 / len(result.scores))
if percent_none > 0.2:
logger.warning(f"More than 20% of the records have 'answer_correctness' as None. Skipping this config...")
return float('NaN')
if not progress_state.get_progress()['first_eval_complete']:
progress_state.set_first_eval_complete()
return result['answer_correctness']
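Two notes on the added guard. First, the pre-existing `result['answer_correctness'] != float('NaN')` check is always true in Python, since NaN never compares equal to anything, itself included; that is presumably why the new code uses `math.isnan` instead. Second, the added `logger.debug("Answer_correctness: ", ...)` passes its value print-style, while the logging module expects %-style arguments, so that value will not render as intended. The threshold logic itself, restated standalone (plain dicts stand in for the ragas per-record scores; the 20% cutoff is the commit's):

```python
import math

def too_many_nan(scores, key="answer_correctness", threshold=0.2):
    """True if more than `threshold` of the records have a NaN score."""
    records = list(scores)
    nan_count = sum(
        1 for r in records
        if r.get(key) is not None and math.isnan(r[key])
    )
    return bool(records) and nan_count / len(records) > threshold

# Illustrative usage with stand-in records:
scores = [{"answer_correctness": 0.8}, {"answer_correctness": float("nan")}]
if too_many_nan(scores):
    print("skipping this config")  # the objective returns NaN to skip such trials
```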
@@ -83,7 +83,10 @@ def getMarkdownHeaderTextSplitter(**kwargs):
("##", "Header 2"),
("###", "Header 3")]
{splitter_name} = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
splits={splitter_name}.split_text(docs[0].page_content)"""
splits = []
for doc in docs:
splits.extend({splitter_name}.split_text(doc.page_content))
"""
import_string = f"""from langchain.text_splitter import MarkdownHeaderTextSplitter"""
return {'code_string':code_string,'import_string':import_string}
except KeyError as e:
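This is the substantive fix in the splitter: previously only `docs[0]` was split, silently dropping every other loaded document. A self-contained sketch of the corrected behavior using LangChain's `MarkdownHeaderTextSplitter` (the sample documents are illustrative):

```python
from langchain_core.documents import Document
from langchain.text_splitter import MarkdownHeaderTextSplitter

headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)

docs = [
    Document(page_content="# Intro\nFirst file."),
    Document(page_content="# Setup\nSecond file."),  # dropped before this fix
]

splits = []
for doc in docs:  # iterate all docs, not just docs[0]
    splits.extend(splitter.split_text(doc.page_content))
```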
26 changes: 13 additions & 13 deletions src/ragbuilder/langchain_module/retriever/retriever.py
@@ -57,7 +57,7 @@ def getRetriever(**kwargs):
if any(reranker in document_compressor_pipeline for reranker in rerankers_to_check):
code_string = f"""retriever=c.as_retriever(search_type='{kwargs['search_type']}', search_kwargs={{'k': 100}})"""
else:
print('No Rerankers')
logger.info('No Rerankers')
code_string = f"""retriever=c.as_retriever(search_type='{kwargs['search_type']}', search_kwargs={{'k': {kwargs['search_kwargs']}}})"""
else:
code_string = f"""retriever=c.as_retriever(search_type='{kwargs['search_type']}', search_kwargs={{'k': {kwargs['search_kwargs']}}})"""
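The branch above widens retrieval to `k=100` whenever a reranker sits downstream: fetch a broad candidate pool, then let the reranker cut it to the final `k`. A runnable sketch of the two rendered alternatives; the vector store choice, fake embeddings, and `k` values are illustrative assumptions:

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import Chroma

c = Chroma.from_texts(["alpha", "beta", "gamma"], FakeEmbeddings(size=8))

# With a reranker in the compressor pipeline: over-fetch a wide candidate pool...
wide = c.as_retriever(search_type="similarity", search_kwargs={"k": 100})

# ...without one: fetch only the final k directly.
narrow = c.as_retriever(search_type="similarity", search_kwargs={"k": 5})
```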
@@ -91,7 +91,7 @@ def getRetriever(**kwargs):
parent_kwargs["chunking_kwargs"]["chunk_overlap"] = kwargs["chunking_kwargs"]["chunk_overlap"]*3
parent_kwargs['splitter_name']="parent_splitter"
parent_chunk_strategy=getChunkingStrategy(**parent_kwargs)
print('parent_chunk_strategy',parent_chunk_strategy)
# print('parent_chunk_strategy',parent_chunk_strategy)
code_string=f"""{parent_chunk_strategy['code_string']}"""
code_string+="""
store = InMemoryStore()
@@ -107,8 +107,7 @@ def getRetriever(**kwargs):
import_string = f"""from langchain_community.retrievers import BM25Retriever"""
return {'code_string':code_string,'import_string':import_string}
elif retriever_type == "colbertRetriever":
print("Colbert Retriever Invoked")
print("Colbert Retriever Invoked",str(search_kwargs))
logger.info("Colbert Retriever Invoked")
timestamp = str(int(time.time()*1000+random.randint(1, 1000)))
index_name = "testindex-ragbuilder-" + timestamp
code_string= f"""
@@ -137,7 +136,7 @@ def getCompressors(**kwargs):
- DocumentCompressorPipeline object based on the specified compressors.
"""
compressor_config = kwargs.get('compressor',None)
print(compressor_config)
# print(compressor_config)
search_kwargs=kwargs.get('search_kwargs',None)
arr_transformer=[]
if 'LLMChainExtractor' in compressor_config:
@@ -174,63 +173,64 @@ def getCompressors(**kwargs):
return {'code_string':code_string,'import_string':import_string}

if 'mixedbread-ai/mxbai-rerank-large-v1' in compressor_config:
code_string= f"""ranker = Reranker("mixedbread-ai/mxbai-rerank-large-v1", verbose=0)
code_string= f"""ranker = Reranker("mixedbread-ai/mxbai-rerank-large-v1", model_type='cross-encoder', verbose=0)
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'flashrank' in compressor_config:
code_string= f"""ranker = Reranker("flashrank", verbose=0)
code_string= f"""ranker = Reranker("flashrank", model_type='FlashRankRanker', verbose=0)
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'cohere' in compressor_config:
code_string= f"""ranker = Reranker("cohere", lang='en', api_key = os.getenv('COHERE_API_KEY'))
code_string= f"""ranker = Reranker("cohere", model_type='APIRanker', lang='en', api_key = os.getenv('COHERE_API_KEY'))
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'jina' in compressor_config:
code_string= f"""ranker = Reranker("jina", api_key = os.getenv('JINA_API_KEY'))
code_string= f"""ranker = Reranker("jina", model_type='APIRanker', api_key = os.getenv('JINA_API_KEY'))
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'colbert' in compressor_config:
code_string= f"""ranker = Reranker("colbert")
code_string= f"""ranker = Reranker("colbert", model_type='ColBERTRanker')
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'mixedbread-ai/mxbai-rerank-base-v1' in compressor_config:
code_string= f"""ranker = Reranker("cross-encoder")
# code_string= f"""ranker = Reranker("cross-encoder")
code_string= f"""ranker = Reranker("mixedbread-ai/mxbai-rerank-base-v1", model_type='cross-encoder', verbose=1)
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'rankllm' in compressor_config:
code_string= f"""ranker = Reranker("rankllm",api_key = os.getenv('OPENAI_API_KEY'))
code_string= f"""ranker = Reranker("rankllm", model_type='RankLLMRanker', api_key = os.getenv('OPENAI_API_KEY'))
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'BAAI/bge-reranker-base' in compressor_config:
code_string= f"""ranker = Reranker("BAAI/bge-reranker-base")
code_string= f"""ranker = Reranker("BAAI/bge-reranker-base", model_type='APIRanker')
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
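Taken together, these edits pin every branch to exactly one reranker backend via an explicit `model_type`, matching the commit title. A sketch of what one rendered snippet does end to end, wrapping the compressor around a small BM25 retriever; the corpus, query, and `k` are illustrative, and the `Reranker` call mirrors the colbert branch above:

```python
from rerankers import Reranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.retrievers import BM25Retriever

base_retriever = BM25Retriever.from_texts(
    ["reranking fixes", "bayes optimization", "markdown splitting"]
)

# One backend, selected explicitly (the colbert branch of this diff).
ranker = Reranker("colbert", model_type="ColBERTRanker")
compressor = ranker.as_langchain_compressor(k=2)

retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)
docs = retriever.invoke("which change selects rerankers one at a time?")
```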