Skip to content

Commit

Permalink
Merge pull request #62 from chklovski/checkm2_dev
Browse files Browse the repository at this point in the history
Checkm2 dev - 1.0.2 changes
  • Loading branch information
chklovski authored May 18, 2023
2 parents 866a7e7 + 51ee3d1 commit 5ca337a
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 27 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ The easiest way to install is using Conda in a new environment:

`conda create -n checkm2 -c bioconda -c conda-forge checkm2`

However, conda can be very slow when resolving the requirements for the environment. A much faster and better way to install CheckM2 is to use mamba to create a new environment:

`mamba create -n checkm2 -c bioconda -c conda-forge checkm2`

CheckM2 is also available on PyPI. To install via pip, use the checkm2.yml file provided in the GitHub repository to create a new conda environment:

`conda env create -n checkm2 -f checkm2.yml` and
Expand Down
6 changes: 4 additions & 2 deletions bin/checkm2
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ if __name__ == '__main__':
return subpar


predict_description = 'Predict the completeness and contamination of genome bins in a folder.'
testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors.'
predict_description = 'Predict the completeness and contamination of genome bins in a folder. Example usage: \n\n' \
'\tcheckm2 predict --threads 30 --input <folder_with_bins> --output-directory <output_folder>'
testrun_description = 'Runs Checkm2 on internal test genomes to ensure it runs without errors. Example usage: \n\n' \
'\t checkm2 testrun --threads 10'
download_description = 'Download/set up required diamond database for CheckM2. Example usage: \n\n ' \
'\tcheckm2 database --download (downloads database into /home/user/databases)\n ' \
'\tcheckm2 database --download --path /path/to/custom_location (downloads database into specified folder)\n ' \
Expand Down
12 changes: 6 additions & 6 deletions checkm2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@ channels:
- bioconda
- defaults
dependencies:
- python>=3.6, <3.9
- python>=3.7, <3.9
- scikit-learn=0.23.2
- h5py=2.10.0
- numpy=1.19.2
- diamond=2.0.4
- tensorflow >= 2.2.0, <2.6.0
- lightgbm = 3.2.1
- pandas <= 1.4.0
- scipy
- prodigal>=2.6.3
- lightgbm=3.2.1
- pandas=1.4.0
- scipy=1.8.0
- prodigal=2.6.3
- setuptools
- requests
- packaging
- tqdm
- tqdm
25 changes: 17 additions & 8 deletions checkm2/predictQuality.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def prediction_wf(self, genes_supplied=False, mode='auto', debug_cos=False,
used_ttables, coding_density, \
N50, avg_gene_len, \
total_bases, cds_count, \
GC = self.__run_prodigal(ttable)
GC, totalContigs, maxContigLen = self.__run_prodigal(ttable)

prodigal_files, used_ttables = fileManager.verify_prodigal_output(self.prodigal_folder, used_ttables, self.bin_extension)

Expand Down Expand Up @@ -285,6 +285,8 @@ def prediction_wf(self, genes_supplied=False, mode='auto', debug_cos=False,
final_results['Genome_Size'] = final_results['Name'].apply(lambda x: total_bases[x])
final_results['GC_Content'] = final_results['Name'].apply(lambda x: np.round(GC[x], 2))
final_results['Total_Coding_Sequences'] = final_results['Name'].apply(lambda x: cds_count[x])
final_results['Total_Contigs'] = final_results['Name'].apply(lambda x: totalContigs[x])
final_results['Max_Contig_Length'] = final_results['Name'].apply(lambda x: maxContigLen[x])


if debug_cos is True:
Expand Down Expand Up @@ -319,7 +321,7 @@ def __flag_divergent_predictions(self, general, specific, threshold=DefaultValue
return compare['Additional_Notes'].values

def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, coding_density,
N50, avg_gene_len, total_bases, cds_count, GC):
N50, avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen):

while True:
bin = queue_in.get(block=True, timeout=None)
Expand All @@ -328,7 +330,8 @@ def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, cod

prodigal_thread = prodigal.ProdigalRunner(self.prodigal_folder, bin)
binname, selected_coding_table, c_density, \
v_N50, v_avg_gene_len, v_total_bases, v_cds_count, v_GC = prodigal_thread.run(bin, ttable)
v_N50, v_avg_gene_len, v_total_bases, v_cds_count, \
v_GC, v_totalContigs, v_maxContigLen = prodigal_thread.run(bin, ttable)

used_ttable[binname] = selected_coding_table
coding_density[binname] = c_density
Expand All @@ -337,8 +340,11 @@ def __set_up_prodigal_thread(self, queue_in, queue_out, ttable, used_ttable, cod
total_bases[binname] = v_total_bases
GC[binname] = v_GC
cds_count[binname] = v_cds_count
totalContigs[binname] = v_totalContigs
maxContigLen[binname] = v_maxContigLen

queue_out.put((bin, selected_coding_table, coding_density, N50, avg_gene_len, total_bases, cds_count, GC))
queue_out.put((bin, selected_coding_table, coding_density, N50, avg_gene_len, total_bases, cds_count,
GC, totalContigs, maxContigLen))

def __reportProgress(self, total_bins, queueIn):
"""Report number of processed bins."""
Expand All @@ -347,7 +353,7 @@ def __reportProgress(self, total_bins, queueIn):

while True:
bin, selected_coding_table, coding_density, N50, \
avg_gene_len, total_bases, cds_count, GC = queueIn.get(block=True, timeout=None)
avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen = queueIn.get(block=True, timeout=None)
if bin == None:
if logging.root.level == logging.INFO or logging.root.level == logging.DEBUG:
sys.stdout.write('\n')
Expand Down Expand Up @@ -384,6 +390,9 @@ def __run_prodigal(self, ttable):
total_bases = mp.Manager().dict()
cds_count = mp.Manager().dict()
GC = mp.Manager().dict()
totalContigs = mp.Manager().dict()
maxContigLen = mp.Manager().dict()


try:
calcProc = []
Expand All @@ -392,7 +401,7 @@ def __run_prodigal(self, ttable):
mp.Process(target=self.__set_up_prodigal_thread, args=(workerQueue, writerQueue, ttable,
used_ttables, coding_density,
N50, avg_gene_len,
total_bases, cds_count, GC)))
total_bases, cds_count, GC, totalContigs, maxContigLen)))
writeProc = mp.Process(target=self.__reportProgress, args=(len(self.bin_files), writerQueue))

writeProc.start()
Expand All @@ -403,7 +412,7 @@ def __run_prodigal(self, ttable):
for p in calcProc:
p.join()

writerQueue.put((None, None, None, None, None, None, None, None))
writerQueue.put((None, None, None, None, None, None, None, None, None, None))
writeProc.join()
except:
# make sure all processes are terminated
Expand All @@ -412,7 +421,7 @@ def __run_prodigal(self, ttable):

writeProc.terminate()

return used_ttables, coding_density, N50, avg_gene_len, total_bases, cds_count, GC
return used_ttables, coding_density, N50, avg_gene_len, total_bases, cds_count, GC, totalContigs, maxContigLen

def __calculate_metadata(self, faa_files):

Expand Down
8 changes: 7 additions & 1 deletion checkm2/prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def __init__(self, out_dir, bin_file):

def __calculate_N50(self, list_of_lengths):

if np.array(list_of_lengths).mean() == 0:
return 0

tmp = []
for tmp_number in set(list_of_lengths):
tmp += [tmp_number] * list_of_lengths.count(tmp_number) * tmp_number
Expand Down Expand Up @@ -172,9 +175,12 @@ def run(self, query, supplied_coding_table=None):
# if prodigal_input.endswith('.gz'):
# shutil.rmtree(tmp_dir)

maxContigLen = np.array(contig_lengths).max()
totalContigs = len(contig_lengths)

return self.file_basename, bestTranslationTable, tableCodingDensity[bestTranslationTable], \
self.__calculate_N50(contig_lengths), np.array(gene_lengths).mean(), totalBases,\
cds_count, GC
cds_count, GC, totalContigs, maxContigLen

def __areORFsCalled(self, aaGeneFile):
return os.path.exists(aaGeneFile) and os.stat(aaGeneFile)[stat.ST_SIZE] != 0
Expand Down
11 changes: 1 addition & 10 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,7 @@
include_package_data=True,
url='https://github.com/chklovski/CheckM2',
license='',
install_requires=('h5py==2.10.0',
'scikit-learn==0.23.2',
'numpy>=1.16.4',
'scipy',
'pandas',
'tensorflow',
'lightgbm',
'requests',
'tqdm'
),
install_requires=(),
author='Alex Chklovski',
scripts=['bin/checkm2'],
author_email='[email protected]',
Expand Down

0 comments on commit 5ca337a

Please sign in to comment.