From 60537be9d5c18ab81b680ab13edb7fdf3ce241bf Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:54:31 +1000 Subject: [PATCH 1/8] pin python version --- checkm2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checkm2.yml b/checkm2.yml index b314366..fad1833 100644 --- a/checkm2.yml +++ b/checkm2.yml @@ -3,7 +3,7 @@ channels: - bioconda - defaults dependencies: - - python>=3.6, <3.9 + - python=3.7 - scikit-learn=0.23.2 - h5py=2.10.0 - numpy=1.19.2 From 4c50f264ae09cf58263712c696b199d62e920148 Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:04:14 +1000 Subject: [PATCH 2/8] rely on yaml for requirements --- setup.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/setup.py b/setup.py index e7cd726..532343c 100644 --- a/setup.py +++ b/setup.py @@ -13,16 +13,7 @@ include_package_data=True, url='https://github.com/chklovski/CheckM2', license='', - install_requires=('h5py==2.10.0', - 'scikit-learn==0.23.2', - 'numpy>=1.16.4', - 'scipy', - 'pandas', - 'tensorflow', - 'lightgbm', - 'requests', - 'tqdm' - ), + install_requires=(), author='Alex Chklovski', scripts=['bin/checkm2'], author_email='chklovski@gmail.com', From f3690e146b633732c7ce9f7e8fe44abb97a6bdca Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:04:50 +1000 Subject: [PATCH 3/8] Update checkm2.yml --- checkm2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checkm2.yml b/checkm2.yml index fad1833..6e20e12 100644 --- a/checkm2.yml +++ b/checkm2.yml @@ -3,7 +3,7 @@ channels: - bioconda - defaults dependencies: - - python=3.7 + - python>=3.7, <3.9 - scikit-learn=0.23.2 - h5py=2.10.0 - numpy=1.19.2 From baa322b8d115a39b09f329a53d16758763ec04bd Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 
27 Apr 2023 15:13:28 +1000 Subject: [PATCH 4/8] strict version requirements --- checkm2.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/checkm2.yml b/checkm2.yml index 6e20e12..8466399 100644 --- a/checkm2.yml +++ b/checkm2.yml @@ -9,10 +9,10 @@ dependencies: - numpy=1.19.2 - diamond=2.0.4 - tensorflow >= 2.1.0, <2.6.0 - - lightgbm = 3.2.1 - - pandas <= 1.4.0 - - scipy - - prodigal>=2.6.3 + - lightgbm=3.2.1 + - pandas=1.4.0 + - scipy=1.8.0 + - prodigal=2.6.3 - setuptools - requests - packaging From 7d1ca77bdd6cd7d9d11e3a4691d0bf91b5ae2e05 Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 18 May 2023 15:05:32 +1000 Subject: [PATCH 5/8] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 7561c0f..b28f9cd 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,10 @@ The easiest way to install is using Conda in a new environment: `conda create -n checkm2 -c bioconda -c conda-forge checkm2` +However, conda can be very slow when processing requirements for the environment. A much faster and better way to install CheckM2 is to create a new environment using mamba: + +`mamba create -n checkm2 -c bioconda -c conda-forge checkm2` + CheckM2 is also available on Pypi. 
To install via pip, use the checkm2.yml file provided in the github to create a new conda environment: `conda env create -n checkm2 -f checkm2.yml` and From 15afa00b193a05824039ba6c2b5690a068b951c9 Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 18 May 2023 15:09:11 +1000 Subject: [PATCH 6/8] Update checkm2.yml --- checkm2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checkm2.yml b/checkm2.yml index 8466399..887852c 100644 --- a/checkm2.yml +++ b/checkm2.yml @@ -8,7 +8,7 @@ dependencies: - h5py=2.10.0 - numpy=1.19.2 - diamond=2.0.4 - - tensorflow >= 2.1.0, <2.6.0 + - tensorflow >= 2.2.0, <2.6.0 - lightgbm=3.2.1 - pandas=1.4.0 - scipy=1.8.0 From e6c0d1a24c6c42828f6a4f487664e2487e475eb8 Mon Sep 17 00:00:00 2001 From: Alex Chklovski <54562698+chklovski@users.noreply.github.com> Date: Thu, 18 May 2023 15:36:36 +1000 Subject: [PATCH 7/8] add command-line help messages --- bin/checkm2 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/checkm2 b/bin/checkm2 index 59019bc..617161c 100755 --- a/bin/checkm2 +++ b/bin/checkm2 @@ -72,8 +72,10 @@ if __name__ == '__main__': return subpar - predict_description = 'Predict the completeness and contamination of genome bins in a folder.' - testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors.' + predict_description = 'Predict the completeness and contamination of genome bins in a folder. Example usage: \n\n' \ + '\tcheckm2 predict --threads 30 --input --output-directory ' + testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors. Example usage: \n\n' \ + '\t checkm2 testrun --threads 10' download_description = 'Download/set up required diamond database for CheckM2. 
Example usage: \n\n ' \ '\tcheckm2 database --download (downloads database into /home/user/databases)\n ' \ '\tcheckm2 database --download --path /path/to/custom_location (downloads database into specified folder)\n ' \ From d479db643077c6bba15c76ab6073892e3f506728 Mon Sep 17 00:00:00 2001 From: Alex Chklovski Date: Thu, 18 May 2023 16:31:34 +1000 Subject: [PATCH 8/8] Now reporting total contigs in a bin + max contig length --- bin/checkm2 | 4 ++-- checkm2/predictQuality.py | 25 +++++++++++++++++-------- checkm2/prodigal.py | 8 +++++++- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/bin/checkm2 b/bin/checkm2 index 617161c..2e13197 100755 --- a/bin/checkm2 +++ b/bin/checkm2 @@ -72,9 +72,9 @@ if __name__ == '__main__': return subpar - predict_description = 'Predict the completeness and contamination of genome bins in a folder. Example usage: \n\n' \ + predict_description = 'Predict the completeness and contamination of genome bins in a folder. Example usage: \n\n' \ '\tcheckm2 predict --threads 30 --input --output-directory ' - testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors. Example usage: \n\n' \ + testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors. Example usage: \n\n' \ '\t checkm2 testrun --threads 10' download_description = 'Download/set up required diamond database for CheckM2. 
Example usage: \n\n ' \ '\tcheckm2 database --download (downloads database into /home/user/databases)\n ' \ diff --git a/checkm2/predictQuality.py b/checkm2/predictQuality.py index dfd98ec..46664eb 100644 --- a/checkm2/predictQuality.py +++ b/checkm2/predictQuality.py @@ -100,7 +100,7 @@ def prediction_wf(self, genes_supplied=False, mode='auto', debug_cos=False, used_ttables, coding_density, \ N50, avg_gene_len, \ total_bases, cds_count, \ - GC = self.__run_prodigal(ttable) + GC, totalContigs, maxContigLen = self.__run_prodigal(ttable) prodigal_files, used_ttables = fileManager.verify_prodigal_output(self.prodigal_folder, used_ttables, self.bin_extension) @@ -285,6 +285,8 @@ def prediction_wf(self, genes_supplied=False, mode='auto', debug_cos=False, final_results['Genome_Size'] = final_results['Name'].apply(lambda x: total_bases[x]) final_results['GC_Content'] = final_results['Name'].apply(lambda x: np.round(GC[x], 2)) final_results['Total_Coding_Sequences'] = final_results['Name'].apply(lambda x: cds_count[x]) + final_results['Total_Contigs'] = final_results['Name'].apply(lambda x: totalContigs[x]) + final_results['Max_Contig_Length'] = final_results['Name'].apply(lambda x: maxContigLen[x]) if debug_cos is True: @@ -319,7 +321,7 @@ def __flag_divergent_predictions(self, general, specific, threshold=DefaultValue return compare['Additional_Notes'].values def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, coding_density, - N50, avg_gene_len, total_bases, cds_count, GC): + N50, avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen): while True: bin = queue_in.get(block=True, timeout=None) @@ -328,7 +330,8 @@ def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, cod prodigal_thread = prodigal.ProdigalRunner(self.prodigal_folder, bin) binname, selected_coding_table, c_density, \ - v_N50, v_avg_gene_len, v_total_bases, v_cds_count, v_GC = prodigal_thread.run(bin, ttable) + v_N50, v_avg_gene_len, 
v_total_bases, v_cds_count, \ + v_GC, v_totalContigs, v_maxContigLen = prodigal_thread.run(bin, ttable) used_ttable[binname] = selected_coding_table coding_density[binname] = c_density @@ -337,8 +340,11 @@ def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, cod total_bases[binname] = v_total_bases GC[binname] = v_GC cds_count[binname] = v_cds_count + totalContigs[binname] = v_totalContigs + maxContigLen[binname] = v_maxContigLen - queue_out.put((bin, selected_coding_table, coding_density, N50, avg_gene_len, total_bases, cds_count, GC)) + queue_out.put((bin, selected_coding_table, coding_density, N50, avg_gene_len, total_bases, cds_count, + GC, totalContigs, maxContigLen)) def __reportProgress(self, total_bins, queueIn): """Report number of processed bins.""" @@ -347,7 +353,7 @@ def __reportProgress(self, total_bins, queueIn): while True: bin, selected_coding_table, coding_density, N50, \ - avg_gene_len, total_bases, cds_count, GC = queueIn.get(block=True, timeout=None) + avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen = queueIn.get(block=True, timeout=None) if bin == None: if logging.root.level == logging.INFO or logging.root.level == logging.DEBUG: sys.stdout.write('\n') @@ -384,6 +390,9 @@ def __run_prodigal(self, ttable): total_bases = mp.Manager().dict() cds_count = mp.Manager().dict() GC = mp.Manager().dict() + totalContigs = mp.Manager().dict() + maxContigLen = mp.Manager().dict() + try: calcProc = [] @@ -392,7 +401,7 @@ def __run_prodigal(self, ttable): mp.Process(target=self.__set_up_prodigal_thread, args=(workerQueue, writerQueue, ttable, used_ttables, coding_density, N50, avg_gene_len, - total_bases, cds_count, GC))) + total_bases, cds_count, GC, totalContigs, maxContigLen))) writeProc = mp.Process(target=self.__reportProgress, args=(len(self.bin_files), writerQueue)) writeProc.start() @@ -403,7 +412,7 @@ def __run_prodigal(self, ttable): for p in calcProc: p.join() - writerQueue.put((None, None, None, None, 
None, None, None, None)) + writerQueue.put((None, None, None, None, None, None, None, None, None, None)) writeProc.join() except: # make sure all processes are terminated @@ -412,7 +421,7 @@ def __run_prodigal(self, ttable): writeProc.terminate() - return used_ttables, coding_density, N50, avg_gene_len, total_bases, cds_count, GC + return used_ttables, coding_density, N50, avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen def __calculate_metadata(self, faa_files): diff --git a/checkm2/prodigal.py b/checkm2/prodigal.py index aba3342..97d8c77 100644 --- a/checkm2/prodigal.py +++ b/checkm2/prodigal.py @@ -34,6 +34,9 @@ def __init__(self, out_dir, bin_file): def __calculate_N50(self, list_of_lengths): + if np.array(list_of_lengths).mean() == 0: + return 0 + tmp = [] for tmp_number in set(list_of_lengths): tmp += [tmp_number] * list_of_lengths.count(tmp_number) * tmp_number @@ -172,9 +175,12 @@ def run(self, query, supplied_coding_table=None): # if prodigal_input.endswith('.gz'): # shutil.rmtree(tmp_dir) + maxContigLen = np.array(contig_lengths).max() + totalContigs = len(contig_lengths) + return self.file_basename, bestTranslationTable, tableCodingDensity[bestTranslationTable], \ self.__calculate_N50(contig_lengths), np.array(gene_lengths).mean(), totalBases,\ - cds_count, GC + cds_count, GC, totalContigs, maxContigLen def __areORFsCalled(self, aaGeneFile): return os.path.exists(aaGeneFile) and os.stat(aaGeneFile)[stat.ST_SIZE] != 0