Merge pull request #54 from KruxAI/reranking-fixes
Change re-ranker selection to 1 at a time
aravind10x authored Oct 5, 2024
2 parents 112e759 + f29f0de commit 710c53a
Showing 9 changed files with 98 additions and 367 deletions.
4 changes: 2 additions & 2 deletions src/ragbuilder/data_processor.py
@@ -68,7 +68,7 @@ def process_directory(self, dir_path: str) -> str:

files = [f for f in Path(dir_path).rglob('*') if f.is_file() and f.name != '.DS_Store']
args = [(str(f), f'{processed_dir}/{str(f.relative_to(dir_path))}.processed') for f in files]
print(args)
# print(args)

# Use the Pool within the main guard
with Pool() as pool:
@@ -90,7 +90,7 @@ def process_file(self, file_path: str, processed_file: str = None) -> str:
if processed_dir != '':
os.makedirs(processed_dir, exist_ok=True)

print(file_path, processed_file)
# print(file_path, processed_file)
logger.info(f"Preprocessing file: {file_path} (this may take a while)...")

# Read the original file and process it
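The two hunks above only silence ad-hoc `print` calls while keeping the `logger.info` lines. For reference, a minimal sketch of the usual alternative: routing the same detail through the module logger at DEBUG level, so it can be toggled rather than commented out. The logging setup and sample values here are assumptions for illustration, not ragbuilder's actual configuration:

```python
import logging

logging.basicConfig(level=logging.INFO)  # assumed setup; switch to DEBUG to surface details
logger = logging.getLogger(__name__)

# Instead of print(args) / print(file_path, processed_file):
args = [("a.txt", "processed/a.txt.processed")]  # illustrative stand-in
logger.debug("Preprocess args: %s", args)        # hidden at INFO, visible at DEBUG
```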
149 changes: 7 additions & 142 deletions src/ragbuilder/executor.py
@@ -11,6 +11,7 @@
import json
import openai
import optuna
import math
from optuna.storages import RDBStorage
from optuna.trial import TrialState
from tenacity import retry, stop_after_attempt, wait_random_exponential, before_sleep_log, retry_if_exception_type, retry_if_result
@@ -142,148 +143,6 @@ def get_model_obj(model_type: str, model: str, temperature: Optional[float] = None
exec(code_str, None, locals_dict)
return locals_dict[model_type]
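For context, the two lines above are the tail of ragbuilder's code-generation pattern: a model definition arrives as a code string, gets `exec`-ed into a namespace, and the resulting object is fetched back by name. A minimal sketch of that pattern with an illustrative code string (the `ChatOpenAI` snippet is an assumption for the example, not the exact string `get_model_obj` builds):

```python
from langchain_openai import ChatOpenAI

# A generated snippet binds an object to a known name ('llm' here)...
code_str = "llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)"

# ...exec() writes that binding into locals_dict, where it can be read back by name.
locals_dict = {"ChatOpenAI": ChatOpenAI}  # symbols the snippet is allowed to reference
exec(code_str, None, locals_dict)
llm = locals_dict["llm"]
```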

def rag_builder_bayes_optmization(**kwargs):
run_id=kwargs['run_id']
src_data=kwargs['src_data']
selected_templates = kwargs.get('selected_templates', [])
vectorDB=kwargs['vectorDB']
min_chunk_size=kwargs.get('min_chunk_size', 1000)
max_chunk_size=kwargs.get('max_chunk_size', 1000)
num_runs=kwargs.get('num_runs')
other_embedding=kwargs.get('other_embedding')
other_llm=kwargs.get('other_llm')
logger.info(f'other_embedding={other_embedding}')
logger.info(f'other_llm={other_llm}')
eval_framework=kwargs.get('eval_framework') # TODO: Add this as an argument to RagEvaluator
eval_embedding=kwargs.get('eval_embedding')
eval_llm=kwargs.get('eval_llm')
sota_embedding=kwargs.get('sota_embedding')
sota_llm=kwargs.get('sota_llm')
test_data=kwargs['test_data'] #loader_kwargs ={'source':'url','input_path': url1},
test_df=pd.read_csv(test_data)
test_ds = Dataset.from_pandas(test_df)
disabled_opts=kwargs['disabled_opts']
result=None
# Define the configuration space
logger.info(f"Initializing RAG parameter set...")
lc_templates.init(vectorDB, min_chunk_size, max_chunk_size, other_embedding, other_llm)
# configs_to_run=dict()
cnt_templates=0
# configs_to_run= {1:{'ragname':'simple_rag'},2:{'ragname':'semantic_chunker'},3:{'ragname':'hyde'},4:{'ragname':'hybrid_rag'},4:{'ragname':'crag'}}
#TODO: Add a check to see if the templates are to be included


if kwargs['compare_templates']:
# configs_to_run.update(top_n_templates)
cnt_templates = len(selected_templates)

if kwargs['include_granular_combos']:
space = lc_templates.generate_config_space(exclude_elements=disabled_opts)
logger.info(f"Config space={space}")
cnt_combos=lc_templates.count_combos() + cnt_templates
logger.info(f"Number of RAG combinations : {cnt_combos}")
configs_evaluated=dict()

if cnt_combos < num_runs:
total_runs=cnt_combos
else:
total_runs = num_runs + cnt_templates
else:
logger.info(f"Number of RAG combinations : {cnt_templates}")
total_runs = cnt_templates

progress_state.set_total_runs(total_runs)

# Run Templates first if templates have been selected
# configs_to_run= {1:{'ragname':'simple_rag'}}
for key in selected_templates:
val = top_n_templates[key]
progress_state.increment_progress()
logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
logger.info(f"SOTA template: {key}: {val['description']}")
# logger.info(f"Template:{key}: {val['description']}:{val['retrieval_model']}")
print(val)
val['loader_kwargs']=src_data
val['embedding_kwargs']={'embedding_model': sota_embedding}
val['llm']=sota_llm
val['run_id']=run_id
rag_builder=SOTARAGBuilder(val)
run_config=RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
# logger.info(f"{repr(run_config)}")
# time.sleep(30)
# result=0
logger.info(f"Evaluating RAG Config #{progress_state.get_progress()['current_run']}... (this may take a while)")
rageval=eval.RagEvaluator(
rag_builder, # code for rag function
test_ds,
llm = get_model_obj('llm', eval_llm),
embeddings = get_model_obj('embedding', eval_embedding),
#TODO: Fetch Run Config settings from advanced settings from front-end
run_config = run_config,
is_async=RUN_CONFIG_IS_ASYNC
)
result=rageval.evaluate()
logger.debug(f'progress_state={progress_state.get_progress()}')

if kwargs['include_granular_combos']:
# Objective function for Bayesian optimization on the custom RAG configurations
@use_named_args(space)
def objective(**params):
config = lc_templates.generate_config_from_params(params)
str_config=json.dumps(config)
score = configs_evaluated.get(str_config, None)
if score:
logger.info(f"Config already evaluated with score: {score}: {config}")
return score

config['loader_kwargs'] = src_data
config['run_id'] = run_id
# logger.info(f"Config raw={config}\n\n")
# logger.info(f"Config={json.dumps(config, indent=4)}\n\n")

progress_state.increment_progress()
logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
logger.info(f"Initializing RAG object...")
rag_builder = RagBuilder(config)
run_config = RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
# logger.info(f"{repr(run_config)}")

# Dummy run to test config structures
# scores=[0.5890422913, 0.7656478429, 0.5935820215, 0.6100727287, 0.7904418542, 0.8966577465, 0.6205320374, 0.5581511382, 0.5966923152, 0.6609632653, 0.550011964, 0.5402692061, 0.5755822793, 0.6234577837, 0.5905206211, 0.5864179955, 0.6062351971, 0.570672658, 0.7500015656, 0.7747984829, 0.7993104194, 0.781805689, 0.5710751929, 0.6645166332, 0.622714199, 0.6356301621, 0.6241188896, 0.8153687664, 0.6827077848, 0.6959527751, 0.8423843881, 0.9609655913, 0.6698080329, 0.5912493806, 0.7359742148, 0.7080427047, 0.6899119678, 0.6105474717, 0.7208188469, 0.695968622, 0.6869681458, 0.7269693914, 0.7424575424, 0.7011177759, 0.8697962711, 0.8088942748, 0.9005903531, 0.8688290896, 0.6666808804, 0.666883309, 0.6888392867, 0.7296173512, 0.6497820307, 0.9349375798, 0.6906564857, 0.7924750533, 0.8931411951, 0.9462395027, 0.881902146, 0.6423630407, 0.7474532458, 0.8388990762, 0.6705516029, 0.7747971947, 0.7218534451, 0.8823771379, 0.8505055572, 0.6567467535, 0.7043667001, 0.6939435603, 0.8808846607, 0.9005438973, 0.8691391629, 0.9763763024, 0.6278870244, 0.7355142518, 0.7633544088, 0.5913903849, 0.626892352, 0.6987860021, 0.6456495151, 0.7416265216, 0.6446452076, 0.7546382667, 0.800226133, 0.9454843785, 0.9280627528, 0.6740895569, 0.7741376011, 0.7247380601, 0.6472672733, 0.8251968841, 0.9085414624, 0.8238757897, 0.6880305725, 0.6632702383, 0.8470425157, 0.6590755791, 0.7576560761, 0.7567810953]
# try:
# score = scores[int(time.time())%100]
# except:
# score = -1
logger.info(f"Evaluating RAG Config #{progress_state.get_progress()['current_run']}... \n(this may take a while)")
rageval = eval.RagEvaluator(
rag_builder,
test_ds,
llm = get_model_obj('llm', eval_llm),
embeddings = get_model_obj('embedding', eval_embedding),
run_config=run_config,
is_async=RUN_CONFIG_IS_ASYNC
)
# x=input("Continue? ")
# if x.lower() != 'y':
# exit()
score = rageval.evaluate()
if not score:
logger.info(f"Completed evaluation. Adding to configs evaluated...")
configs_evaluated[str_config]=score
return -score # We negate the score because gp_minimize minimizes

# Run Bayesian optimization
logger.info(f"Running Bayesian optimization...")
result = gp_minimize(objective, space, n_calls=num_runs, random_state=42) #, callback=DeltaXStopper(1e-8))
logger.info(f"Completed Bayesian optimization...")

best_params = result.x
best_score = -result.fun

logger.info(f"Best Configuration: {best_params}")
logger.info(f"Best Score: {best_score}")
return 0
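The function deleted above appears to be the old scikit-optimize path (its Optuna replacement follows below). Its core trick deserves one standalone sketch: `gp_minimize` only minimizes, so a higher-is-better score is negated inside the objective and negated back at the end. `evaluate_config` below is a hypothetical stand-in for the actual RAG evaluation:

```python
from skopt import gp_minimize
from skopt.space import Categorical, Integer
from skopt.utils import use_named_args

space = [
    Integer(500, 2000, name="chunk_size"),
    Categorical(["vanilla", "hybrid"], name="retriever"),
]

def evaluate_config(params):  # hypothetical scorer, higher is better
    return 1.0 / (1 + abs(params["chunk_size"] - 1000))

@use_named_args(space)  # maps each sampled point to named kwargs
def objective(**params):
    return -evaluate_config(params)  # negate: minimizing this maximizes the score

result = gp_minimize(objective, space, n_calls=15, random_state=42)
best_params, best_score = result.x, -result.fun  # undo the negation
```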

def rag_builder_bayes_optimization_optuna(**kwargs):
run_id=kwargs['run_id']
src_data=kwargs['src_data']
@@ -424,6 +283,12 @@ def objective(trial):
result = rageval.evaluate()
logger.info(f"Completed evaluation. result={result}...")
if 'answer_correctness' in result and result['answer_correctness'] != float('NaN'):
logger.debug("Answer_correctness: ", result.scores["answer_correctness"])
none_records = len(result.scores.filter(lambda x: math.isnan(x['answer_correctness']) if x['answer_correctness'] is not None else False))
percent_none = (none_records * 1.0 / len(result.scores))
if percent_none > 0.2:
logger.warning(f"More than 20% of the records have 'answer_correctness' as None. Skipping this config...")
return float('NaN')
if not progress_state.get_progress()['first_eval_complete']:
progress_state.set_first_eval_complete()
return result['answer_correctness']
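Two notes on the added guard. First, the pre-existing `result['answer_correctness'] != float('NaN')` check is always true in Python, since NaN never compares equal to anything, itself included; that is presumably why the new code uses `math.isnan` instead. Second, the added `logger.debug("Answer_correctness: ", ...)` passes its value print-style, while the logging module expects %-style arguments, so that value will not render as intended. The threshold logic itself, restated standalone (plain dicts stand in for the ragas per-record scores; the 20% cutoff is the commit's):

```python
import math

def too_many_nan(scores, key="answer_correctness", threshold=0.2):
    """True if more than `threshold` of the records have a NaN score."""
    records = list(scores)
    nan_count = sum(
        1 for r in records
        if r.get(key) is not None and math.isnan(r[key])
    )
    return bool(records) and nan_count / len(records) > threshold

# Illustrative usage with stand-in records:
scores = [{"answer_correctness": 0.8}, {"answer_correctness": float("nan")}]
if too_many_nan(scores):
    print("skipping this config")  # the objective returns NaN to skip such trials
```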
@@ -83,7 +83,10 @@ def getMarkdownHeaderTextSplitter(**kwargs):
("##", "Header 2"),
("###", "Header 3")]
{splitter_name} = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
splits={splitter_name}.split_text(docs[0].page_content)"""
splits = []
for doc in docs:
splits.extend({splitter_name}.split_text(doc.page_content))
"""
import_string = f"""from langchain.text_splitter import MarkdownHeaderTextSplitter"""
return {'code_string':code_string,'import_string':import_string}
except KeyError as e:
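This is the substantive fix in the splitter: previously only `docs[0]` was split, silently dropping every other loaded document. A self-contained sketch of the corrected behavior using LangChain's `MarkdownHeaderTextSplitter` (the sample documents are illustrative):

```python
from langchain_core.documents import Document
from langchain.text_splitter import MarkdownHeaderTextSplitter

headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)

docs = [
    Document(page_content="# Intro\nFirst file."),
    Document(page_content="# Setup\nSecond file."),  # dropped before this fix
]

splits = []
for doc in docs:  # iterate all docs, not just docs[0]
    splits.extend(splitter.split_text(doc.page_content))
```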
26 changes: 13 additions & 13 deletions src/ragbuilder/langchain_module/retriever/retriever.py
@@ -57,7 +57,7 @@ def getRetriever(**kwargs):
if any(reranker in document_compressor_pipeline for reranker in rerankers_to_check):
code_string = f"""retriever=c.as_retriever(search_type='{kwargs['search_type']}', search_kwargs={{'k': 100}})"""
else:
print('No Rerankers')
logger.info('No Rerankers')
code_string = f"""retriever=c.as_retriever(search_type='{kwargs['search_type']}', search_kwargs={{'k': {kwargs['search_kwargs']}}})"""
else:
code_string = f"""retriever=c.as_retriever(search_type='{kwargs['search_type']}', search_kwargs={{'k': {kwargs['search_kwargs']}}})"""
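The branch above widens retrieval to `k=100` whenever a reranker sits downstream: fetch a broad candidate pool, then let the reranker cut it to the final `k`. A runnable sketch of the two rendered alternatives; the vector store choice, fake embeddings, and `k` values are illustrative assumptions:

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import Chroma

c = Chroma.from_texts(["alpha", "beta", "gamma"], FakeEmbeddings(size=8))

# With a reranker in the compressor pipeline: over-fetch a wide candidate pool...
wide = c.as_retriever(search_type="similarity", search_kwargs={"k": 100})

# ...without one: fetch only the final k directly.
narrow = c.as_retriever(search_type="similarity", search_kwargs={"k": 5})
```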
@@ -91,7 +91,7 @@ def getRetriever(**kwargs):
parent_kwargs["chunking_kwargs"]["chunk_overlap"] = kwargs["chunking_kwargs"]["chunk_overlap"]*3
parent_kwargs['splitter_name']="parent_splitter"
parent_chunk_strategy=getChunkingStrategy(**parent_kwargs)
print('parent_chunk_strategy',parent_chunk_strategy)
# print('parent_chunk_strategy',parent_chunk_strategy)
code_string=f"""{parent_chunk_strategy['code_string']}"""
code_string+="""
store = InMemoryStore()
@@ -107,8 +107,7 @@ def getRetriever(**kwargs):
import_string = f"""from langchain_community.retrievers import BM25Retriever"""
return {'code_string':code_string,'import_string':import_string}
elif retriever_type == "colbertRetriever":
print("Colbert Retriever Invoked")
print("Colbert Retriever Invoked",str(search_kwargs))
logger.info("Colbert Retriever Invoked")
timestamp = str(int(time.time()*1000+random.randint(1, 1000)))
index_name = "testindex-ragbuilder-" + timestamp
code_string= f"""
@@ -137,7 +136,7 @@ def getCompressors(**kwargs):
- DocumentCompressorPipeline object based on the specified compressors.
"""
compressor_config = kwargs.get('compressor',None)
print(compressor_config)
# print(compressor_config)
search_kwargs=kwargs.get('search_kwargs',None)
arr_transformer=[]
if 'LLMChainExtractor' in compressor_config:
@@ -174,63 +173,64 @@ def getCompressors(**kwargs):
return {'code_string':code_string,'import_string':import_string}

if 'mixedbread-ai/mxbai-rerank-large-v1' in compressor_config:
code_string= f"""ranker = Reranker("mixedbread-ai/mxbai-rerank-large-v1", verbose=0)
code_string= f"""ranker = Reranker("mixedbread-ai/mxbai-rerank-large-v1", model_type='cross-encoder', verbose=0)
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'flashrank' in compressor_config:
code_string= f"""ranker = Reranker("flashrank", verbose=0)
code_string= f"""ranker = Reranker("flashrank", model_type='FlashRankRanker', verbose=0)
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'cohere' in compressor_config:
code_string= f"""ranker = Reranker("cohere", lang='en', api_key = os.getenv('COHERE_API_KEY'))
code_string= f"""ranker = Reranker("cohere", model_type='APIRanker', lang='en', api_key = os.getenv('COHERE_API_KEY'))
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'jina' in compressor_config:
code_string= f"""ranker = Reranker("jina", api_key = os.getenv('JINA_API_KEY'))
code_string= f"""ranker = Reranker("jina", model_type='APIRanker', api_key = os.getenv('JINA_API_KEY'))
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'colbert' in compressor_config:
code_string= f"""ranker = Reranker("colbert")
code_string= f"""ranker = Reranker("colbert", model_type='ColBERTRanker')
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'mixedbread-ai/mxbai-rerank-base-v1' in compressor_config:
code_string= f"""ranker = Reranker("cross-encoder")
# code_string= f"""ranker = Reranker("cross-encoder")
code_string= f"""ranker = Reranker("mixedbread-ai/mxbai-rerank-base-v1", model_type='cross-encoder', verbose=1)
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'rankllm' in compressor_config:
code_string= f"""ranker = Reranker("rankllm",api_key = os.getenv('OPENAI_API_KEY'))
code_string= f"""ranker = Reranker("rankllm", model_type='RankLLMRanker', api_key = os.getenv('OPENAI_API_KEY'))
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
import_string = f"""from rerankers import Reranker"""
return {'code_string':code_string,'import_string':import_string}

if 'BAAI/bge-reranker-base' in compressor_config:
code_string= f"""ranker = Reranker("BAAI/bge-reranker-base")
code_string= f"""ranker = Reranker("BAAI/bge-reranker-base", model_type='APIRanker')
compressor = ranker.as_langchain_compressor(k={search_kwargs})
arr_comp.append(compressor)
"""
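Taken together, these edits pin every branch to exactly one reranker backend via an explicit `model_type`, matching the commit title. A sketch of what one rendered snippet does end to end, wrapping the compressor around a small BM25 retriever; the corpus, query, and `k` are illustrative, and the `Reranker` call mirrors the colbert branch above:

```python
from rerankers import Reranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.retrievers import BM25Retriever

base_retriever = BM25Retriever.from_texts(
    ["reranking fixes", "bayes optimization", "markdown splitting"]
)

# One backend, selected explicitly (the colbert branch of this diff).
ranker = Reranker("colbert", model_type="ColBERTRanker")
compressor = ranker.as_langchain_compressor(k=2)

retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)
docs = retriever.invoke("which change selects rerankers one at a time?")
```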