Skip to content

Commit

Permalink
adding progress bar to duckdb
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Nov 27, 2023
1 parent 3157730 commit 0e860d2
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 16 deletions.
16 changes: 8 additions & 8 deletions python/quantmsio/quantms_io/commands/diann_convert_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ def cli():
required=True,
)
@click.option("--output_prefix_file", help="Prefix of the Json file needed to generate the file name", required=False)
@click.option("--max_memory", help= "The maximum amount of memory allocated", default = '32GB')
@click.option("--worker_threads", help= "The threads of worker", default = 4)
@click.option("--duckdb_max_memory", help= "The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)")
@click.option("--duckdb_threads", help= "The number of threads for the DuckDB engine (e.g 4)")
@click.option("--file_num", help= "The number of files being processed at the same time", default = 100)
@click.pass_context
def diann_convert_to_parquet(ctx, report_path: str, design_file: str, modifications:List, qvalue_threshold: float,
mzml_info_folder:str, sdrf_path:str, output_folder:str, output_prefix_file:str,max_memory:str,worker_threads:int,
file_num:int ):
mzml_info_folder:str, sdrf_path:str, output_folder:str, output_prefix_file:str,
duckdb_max_memory:str, duckdb_threads:int, file_num:int ):
'''
report_path: diann report file path
design_file: the disign file path
Expand All @@ -67,8 +67,8 @@ def diann_convert_to_parquet(ctx, report_path: str, design_file: str, modificati
sdrf_path: sdrf file path
output_folder: Folder where the Json file will be generated
output_prefix_file: Prefix of the Json file needed to generate the file name
max_memory: The maximum amount of memory allocated
worker_threads: The threads of worker
duckdb_max_memory: The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)
duckdb_threads: The number of threads for the DuckDB engine (e.g 4)
file_num: The number of files being processed at the same time
'''
if not os.path.exists(output_folder):
Expand All @@ -91,8 +91,8 @@ def diann_convert_to_parquet(ctx, report_path: str, design_file: str, modificati
sdrf_path = sdrf_path,
psm_output_path=psm_output_path,
feature_output_path = feature_output_path,
max_memory = max_memory,
worker_threads = worker_threads,
max_memory = duckdb_max_memory,
worker_threads = duckdb_threads,
file_num = file_num
)

Expand Down
20 changes: 13 additions & 7 deletions python/quantmsio/quantms_io/core/diann_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,17 +170,23 @@ def create_duckdb_from_diann_report(report_path,max_memory,worker_threads):
:return: A duckdb database
"""
s = time.time()
database = duckdb.connect(config={
'max_memory': max_memory,
'worker_threads': worker_threads
})

database = duckdb.connect()
database.execute("SET enable_progress_bar=true")

if max_memory is not None:
database.execute("SET max_memory='{}'".format(max_memory))
if worker_threads is not None:
database.execute("SET worker_threads='{}'".format(worker_threads))

msg = database.execute("SELECT * FROM duckdb_settings() where name in ('worker_threads', 'max_memory')").df()
logging.info('Duckdb uses {} threads.'.format(str(msg['value'][0])))
logging.info('duckdb uses {} of memory.'.format(str(msg['value'][1])))
database.execute("SET enable_progress_bar=true")

database.execute("CREATE TABLE diann_report AS SELECT * FROM '{}'".format(report_path))
database.execute("""CREATE INDEX idx_precursor_q ON diann_report ("Precursor.Id", "Q.Value")""")
database.execute("""CREATE INDEX idx_run ON diann_report ("Run")""")

et = time.time() - s
logging.info('Time to create duckdb database {} seconds'.format(et))
return database
Expand Down Expand Up @@ -322,11 +328,11 @@ def intergrate_msg(n):

def generate_psm_and_feature_file(self, report_path: str, qvalue_threshold: float, mzml_info_folder: str,
design_file: str, modifications:list, sdrf_path:str, psm_output_path:str,
feature_output_path:str, max_memory:str='8GB',worker_threads:int=4,file_num:int=100):
feature_output_path:str, max_memory:str = None, worker_threads:int = None,file_num:int=2):
psm_pqwriter = None
feature_pqwriter = None

self._duckdb = create_duckdb_from_diann_report(report_path,max_memory,worker_threads)
self._duckdb = create_duckdb_from_diann_report(report_path, max_memory, worker_threads)

s_data_frame, f_table = get_exp_design_dfs(design_file)
self._modifications = get_modifications(modifications[0], modifications[1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_all_required_arguments(self):
modifications=modifications, qvalue_threshold=qvalue_threshold,
feature_output_path=feature_output_path,
psm_output_path=psm_output_path, sdrf_path=sdrf_file,
mzml_info_folder=mzml_info_folder, thread_num=10)
mzml_info_folder=mzml_info_folder, file_num=10)
et = time.time()

# get the execution time
Expand Down

0 comments on commit 0e860d2

Please sign in to comment.