diff --git a/README.md b/README.md index 7561c0f..b28f9cd 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,10 @@ The easiest way to install is using Conda in a new environment: `conda create -n checkm2 -c bioconda -c conda-forge checkm2` +However, conda can be very slow when processing requirements for the environment. A much faster and better way to install CheckM2 is to use mamba to create a new environment: + +`mamba create -n checkm2 -c bioconda -c conda-forge checkm2` + CheckM2 is also available on Pypi. To install via pip, use the checkm2.yml file provided in the github to create a new conda environment: `conda env create -n checkm2 -f checkm2.yml` and diff --git a/bin/checkm2 b/bin/checkm2 index 59019bc..2e13197 100755 --- a/bin/checkm2 +++ b/bin/checkm2 @@ -72,8 +72,10 @@ if __name__ == '__main__': return subpar - predict_description = 'Predict the completeness and contamination of genome bins in a folder.' - testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors.' + predict_description = 'Predict the completeness and contamination of genome bins in a folder. Example usage: \n\n' \ + '\tcheckm2 predict --threads 30 --input --output-directory ' + testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors. Example usage: \n\n' \ + '\t checkm2 testrun --threads 10' download_description = 'Download/set up required diamond database for CheckM2. 
Example usage: \n\n ' \ '\tcheckm2 database --download (downloads database into /home/user/databases)\n ' \ '\tcheckm2 database --download --path /path/to/custom_location (downloads database into specified folder)\n ' \ diff --git a/checkm2.yml b/checkm2.yml index 4e539d4..c093859 100644 --- a/checkm2.yml +++ b/checkm2.yml @@ -3,17 +3,17 @@ channels: - bioconda - defaults dependencies: - - python>=3.6, <3.9 + - python>=3.7, <3.9 - scikit-learn=0.23.2 - h5py=2.10.0 - numpy=1.19.2 - diamond=2.0.4 - tensorflow >= 2.2.0, <2.6.0 - - lightgbm = 3.2.1 - - pandas <= 1.4.0 - - scipy - - prodigal>=2.6.3 + - lightgbm=3.2.1 + - pandas=1.4.0 + - scipy=1.8.0 + - prodigal=2.6.3 - setuptools - requests - packaging - - tqdm + - tqdm \ No newline at end of file diff --git a/checkm2/predictQuality.py b/checkm2/predictQuality.py index dfd98ec..46664eb 100644 --- a/checkm2/predictQuality.py +++ b/checkm2/predictQuality.py @@ -100,7 +100,7 @@ def prediction_wf(self, genes_supplied=False, mode='auto', debug_cos=False, used_ttables, coding_density, \ N50, avg_gene_len, \ total_bases, cds_count, \ - GC = self.__run_prodigal(ttable) + GC, totalContigs, maxContigLen = self.__run_prodigal(ttable) prodigal_files, used_ttables = fileManager.verify_prodigal_output(self.prodigal_folder, used_ttables, self.bin_extension) @@ -285,6 +285,8 @@ def prediction_wf(self, genes_supplied=False, mode='auto', debug_cos=False, final_results['Genome_Size'] = final_results['Name'].apply(lambda x: total_bases[x]) final_results['GC_Content'] = final_results['Name'].apply(lambda x: np.round(GC[x], 2)) final_results['Total_Coding_Sequences'] = final_results['Name'].apply(lambda x: cds_count[x]) + final_results['Total_Contigs'] = final_results['Name'].apply(lambda x: totalContigs[x]) + final_results['Max_Contig_Length'] = final_results['Name'].apply(lambda x: maxContigLen[x]) if debug_cos is True: @@ -319,7 +321,7 @@ def __flag_divergent_predictions(self, general, specific, threshold=DefaultValue return 
compare['Additional_Notes'].values def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, coding_density, - N50, avg_gene_len, total_bases, cds_count, GC): + N50, avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen): while True: bin = queue_in.get(block=True, timeout=None) @@ -328,7 +330,8 @@ def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, cod prodigal_thread = prodigal.ProdigalRunner(self.prodigal_folder, bin) binname, selected_coding_table, c_density, \ - v_N50, v_avg_gene_len, v_total_bases, v_cds_count, v_GC = prodigal_thread.run(bin, ttable) + v_N50, v_avg_gene_len, v_total_bases, v_cds_count, \ + v_GC, v_totalContigs, v_maxContigLen = prodigal_thread.run(bin, ttable) used_ttable[binname] = selected_coding_table coding_density[binname] = c_density @@ -337,8 +340,11 @@ def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, cod total_bases[binname] = v_total_bases GC[binname] = v_GC cds_count[binname] = v_cds_count + totalContigs[binname] = v_totalContigs + maxContigLen[binname] = v_maxContigLen - queue_out.put((bin, selected_coding_table, coding_density, N50, avg_gene_len, total_bases, cds_count, GC)) + queue_out.put((bin, selected_coding_table, coding_density, N50, avg_gene_len, total_bases, cds_count, + GC, totalContigs, maxContigLen)) def __reportProgress(self, total_bins, queueIn): """Report number of processed bins.""" @@ -347,7 +353,7 @@ def __reportProgress(self, total_bins, queueIn): while True: bin, selected_coding_table, coding_density, N50, \ - avg_gene_len, total_bases, cds_count, GC = queueIn.get(block=True, timeout=None) + avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen = queueIn.get(block=True, timeout=None) if bin == None: if logging.root.level == logging.INFO or logging.root.level == logging.DEBUG: sys.stdout.write('\n') @@ -384,6 +390,9 @@ def __run_prodigal(self, ttable): total_bases = mp.Manager().dict() cds_count = 
mp.Manager().dict() GC = mp.Manager().dict() + totalContigs = mp.Manager().dict() + maxContigLen = mp.Manager().dict() + try: calcProc = [] @@ -392,7 +401,7 @@ def __run_prodigal(self, ttable): mp.Process(target=self.__set_up_prodigal_thread, args=(workerQueue, writerQueue, ttable, used_ttables, coding_density, N50, avg_gene_len, - total_bases, cds_count, GC))) + total_bases, cds_count, GC, totalContigs, maxContigLen))) writeProc = mp.Process(target=self.__reportProgress, args=(len(self.bin_files), writerQueue)) writeProc.start() @@ -403,7 +412,7 @@ def __run_prodigal(self, ttable): for p in calcProc: p.join() - writerQueue.put((None, None, None, None, None, None, None, None)) + writerQueue.put((None, None, None, None, None, None, None, None, None, None)) writeProc.join() except: # make sure all processes are terminated @@ -412,7 +421,7 @@ def __run_prodigal(self, ttable): writeProc.terminate() - return used_ttables, coding_density, N50, avg_gene_len, total_bases, cds_count, GC + return used_ttables, coding_density, N50, avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen def __calculate_metadata(self, faa_files): diff --git a/checkm2/prodigal.py b/checkm2/prodigal.py index aba3342..97d8c77 100644 --- a/checkm2/prodigal.py +++ b/checkm2/prodigal.py @@ -34,6 +34,9 @@ def __init__(self, out_dir, bin_file): def __calculate_N50(self, list_of_lengths): + if np.array(list_of_lengths).mean() == 0: + return 0 + tmp = [] for tmp_number in set(list_of_lengths): tmp += [tmp_number] * list_of_lengths.count(tmp_number) * tmp_number @@ -172,9 +175,12 @@ def run(self, query, supplied_coding_table=None): # if prodigal_input.endswith('.gz'): # shutil.rmtree(tmp_dir) + maxContigLen = np.array(contig_lengths).max() + totalContigs = len(contig_lengths) + return self.file_basename, bestTranslationTable, tableCodingDensity[bestTranslationTable], \ self.__calculate_N50(contig_lengths), np.array(gene_lengths).mean(), totalBases,\ - cds_count, GC + cds_count, GC, 
totalContigs, maxContigLen def __areORFsCalled(self, aaGeneFile): return os.path.exists(aaGeneFile) and os.stat(aaGeneFile)[stat.ST_SIZE] != 0 diff --git a/setup.py b/setup.py index e7cd726..532343c 100644 --- a/setup.py +++ b/setup.py @@ -13,16 +13,7 @@ include_package_data=True, url='https://github.com/chklovski/CheckM2', license='', - install_requires=('h5py==2.10.0', - 'scikit-learn==0.23.2', - 'numpy>=1.16.4', - 'scipy', - 'pandas', - 'tensorflow', - 'lightgbm', - 'requests', - 'tqdm' - ), + install_requires=(), author='Alex Chklovski', scripts=['bin/checkm2'], author_email='chklovski@gmail.com',