From 18dcfe34fac99d24907168d01a50b5b13d2f16ef Mon Sep 17 00:00:00 2001 From: CBeelen Date: Fri, 23 Jul 2021 16:36:14 -0700 Subject: [PATCH 01/18] Use Haploflow instead of IVA --- micall/core/denovo.py | 23 ++++++----------------- requirements.txt | 1 + 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index c3ce3c9c7..807c91935 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -20,7 +20,7 @@ from micall.core.project_config import ProjectConfig -IVA = "iva" +HAPLOFLOW = "haploflow" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', 'blast_db', @@ -201,26 +201,15 @@ def denovo(fastq1_path: str, '--interleave', '-o', joined_path], check=True) - iva_out_path = os.path.join(tmp_dir, 'iva_out') - contigs_fasta_path = os.path.join(iva_out_path, 'contigs.fasta') - iva_args = [IVA, '--fr', joined_path, '-t', '2'] - if merged_contigs_csv is not None: - seeds_fasta_path = os.path.join(tmp_dir, 'seeds.fasta') - with open(seeds_fasta_path, 'w') as seeds_fasta: - SeqIO.write((SeqRecord(Seq(row['contig']), f'seed-{i}', '', '') - for i, row in enumerate(DictReader(merged_contigs_csv))), - seeds_fasta, - 'fasta') - seeds_size = seeds_fasta.tell() - if seeds_size > 0: - iva_args.extend(['--contigs', seeds_fasta_path, '--make_new_seeds']) - iva_args.append(iva_out_path) + haplo_out_path = os.path.join(tmp_dir, 'haplo_out') + contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') + haplo_args = [HAPLOFLOW, '--read-file', joined_path, '--out', haplo_out_path] try: - run(iva_args, check=True, stdout=PIPE, stderr=STDOUT) + run(haplo_args, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: output = ex.output and ex.output.decode('UTF8') if output != 'Failed to make first seed. Cannot continue\n': - logger.warning('iva failed to assemble.', exc_info=True) + logger.warning('Haploflow failed to assemble.', exc_info=True) logger.warning(output) with open(contigs_fasta_path, 'a'): pass diff --git a/requirements.txt b/requirements.txt index 377d82966..1f8e66cdb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ reportlab==3.5.68 pysam==0.16.0.1 git+https://github.com/cfe-lab/genetracks.git@v0.2.dev0 mappy==2.21 +git+https://github.com/hzi-bifo/Haploflow.git From 8eb81be750effb07a677c4fb29f36c1e356dcfd1 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 4 Aug 2021 14:58:00 -0700 Subject: [PATCH 02/18] Read in Haploflow's input arguments --- micall/core/denovo.py | 21 +++++++++++++++++++-- micall/drivers/sample.py | 10 ++++++---- micall_docker.py | 39 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 807c91935..5e727d01b 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -176,7 +176,8 @@ def denovo(fastq1_path: str, contigs_csv: typing.TextIO, work_dir: str = '.', merged_contigs_csv: typing.TextIO = None, - blast_csv: typing.TextIO = None): + blast_csv: typing.TextIO = None, + haplo_args=None): """ Use de novo assembly to build contigs from reads. :param fastq1_path: FASTQ file name for read 1 reads @@ -201,9 +202,25 @@ def denovo(fastq1_path: str, '--interleave', '-o', joined_path], check=True) + + if haplo_args is None: + haplo_args = {'long': 0, + 'filter': 500, + 'thres': -1, + 'strict': 5, + 'error': 2, + 'kmer': 41} haplo_out_path = os.path.join(tmp_dir, 'haplo_out') contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') - haplo_args = [HAPLOFLOW, '--read-file', joined_path, '--out', haplo_out_path] + haplo_args = [HAPLOFLOW, + '--read-file', joined_path, + '--out', haplo_out_path, + '--k', haplo_args['kmer'], + '--error-rate', haplo_args['error'], + '--strict', haplo_args['strict'], + '--filter', haplo_args['filter'], + '--thres', haplo_args['thres'], + '--long', haplo_args['long']] try: run(haplo_args, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index 714e6cec0..fc605380e 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -109,7 +109,8 @@ def process(self, excluded_seeds=(), excluded_projects=(), force_gzip=False, - use_denovo=False): + use_denovo=False, + haplo_args=None): """ Process a single sample. :param pssm: the pssm library for running G2P analysis @@ -171,7 +172,7 @@ def process(self, merged_contigs_csv=merged_contigs_csv) if use_denovo: - self.run_denovo(excluded_seeds) + self.run_denovo(excluded_seeds, haplo_args=haplo_args) else: self.run_mapping(excluded_seeds) @@ -309,7 +310,7 @@ def run_mapping(self, excluded_seeds): scratch_path, debug_file_prefix=debug_file_prefix) - def run_denovo(self, excluded_seeds): + def run_denovo(self, excluded_seeds, haplo_args=None): logger.info('Running de novo assembly on %s.', self) scratch_path = self.get_scratch_path() with open(self.merged_contigs_csv) as merged_contigs_csv, \ @@ -320,7 +321,8 @@ def run_denovo(self, excluded_seeds): contigs_csv, self.scratch_path, merged_contigs_csv, - blast_csv=blast_csv) + blast_csv=blast_csv, + haplo_args=haplo_args) logger.info('Running remap on %s.', self) if self.debug_remap: debug_file_prefix = os.path.join(scratch_path, 'debug') diff --git a/micall_docker.py b/micall_docker.py index 4ada18389..034264aa9 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -343,6 +343,36 @@ def get_parser(default_max_active): "--project_code", "-p", help="Select primers to trim: HCV or SARSCOV2.") + command_parser.add_argument( + "-haplo_long", + type=int, + default=0, + ) + command_parser.add_argument( + "-haplo_filter", + type=int, + default=500, + ) + command_parser.add_argument( + "-haplo_thres", + type=int, + default=-1, + ) + command_parser.add_argument( + "-haplo_strict", + type=int, + default=5, + ) + command_parser.add_argument( + "-haplo_error", + type=int, + default=2, + ) + command_parser.add_argument( + "-haplo_kmer", + type=int, + default=41, + ) return parser @@ -943,13 +973,20 @@ def process_sample(sample, args, pssm, use_denovo=False): """ sample.debug_remap = args.debug_remap sample.skip = args.skip + args_haplo = {'long': args.haplo_long, + 'filter': args.haplo_filter, + 'thres': args.haplo_thres, + 'strict': args.haplo_strict, + 'error': args.haplo_error, + 'kmer': args.haplo_kmer} try: excluded_seeds = [] if args.all_projects else EXCLUDED_SEEDS excluded_projects = [] if args.all_projects else EXCLUDED_PROJECTS sample.process(pssm, excluded_seeds, excluded_projects, - use_denovo=use_denovo) + use_denovo=use_denovo, + haplo_args=args_haplo) except Exception: message = 'Failed to process {}.'.format(sample) logger.error(message, exc_info=True) From 550bb55b6abfcc06a7f48c1b5e8e791f5966af03 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 4 Aug 2021 15:53:01 -0700 Subject: [PATCH 03/18] Add debug mode to generate graphs --- micall/core/denovo.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 5e727d01b..1c9657fb7 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -215,12 +215,13 @@ def denovo(fastq1_path: str, haplo_args = [HAPLOFLOW, '--read-file', joined_path, '--out', haplo_out_path, - '--k', haplo_args['kmer'], - '--error-rate', haplo_args['error'], - '--strict', haplo_args['strict'], - '--filter', haplo_args['filter'], - '--thres', haplo_args['thres'], - '--long', haplo_args['long']] + '--k', str(haplo_args['kmer']), + '--error-rate', str(haplo_args['error']), + '--strict', str(haplo_args['strict']), + '--filter', str(haplo_args['filter']), + '--thres', str(haplo_args['thres']), + '--long', str(haplo_args['long']), + '--debug', '1'] try: run(haplo_args, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: From b0a7e0a13651f12d06a10c8b420ee6fbb34da28c Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 4 Aug 2021 16:03:44 -0700 Subject: [PATCH 04/18] Correct default error rate --- micall/core/denovo.py | 2 +- micall_docker.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 1c9657fb7..407956c30 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -208,7 +208,7 @@ def denovo(fastq1_path: str, 'filter': 500, 'thres': -1, 'strict': 5, - 'error': 2, + 'error': 0.02, 'kmer': 41} haplo_out_path = os.path.join(tmp_dir, 'haplo_out') contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') diff --git a/micall_docker.py b/micall_docker.py index 034264aa9..3b076169c 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -365,8 +365,8 @@ def get_parser(default_max_active): ) command_parser.add_argument( "-haplo_error", - type=int, - default=2, + type=float, + default=0.02, ) command_parser.add_argument( "-haplo_kmer", From 7960ef9dab6808711cf9504facc78c4aad27f95c Mon Sep 17 00:00:00 2001 From: CBeelen Date: Mon, 9 Aug 2021 14:19:09 -0700 Subject: [PATCH 05/18] Trim and merge contigs after assembly using IVA's tools --- micall/core/denovo.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 407956c30..ab8b99d48 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -20,6 +20,12 @@ from micall.core.project_config import ProjectConfig +from tempfile import mkstemp +from shutil import move, copymode +from os import fdopen, remove +from iva.assembly import Assembly +from pyfastaq.tasks import deinterleave + HAPLOFLOW = "haploflow" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', @@ -232,6 +238,32 @@ def denovo(fastq1_path: str, with open(contigs_fasta_path, 'a'): pass + fh, abs_path = mkstemp() + i = 0 + with fdopen(fh, 'w') as new_file: + with open(contigs_fasta_path) as old_file: + for line in old_file: + if line.startswith('>'): + new_file.write(f">contig{i}\n") + i += 1 + else: + new_file.write(line) + copymode(contigs_fasta_path, abs_path) + remove(contigs_fasta_path) + move(abs_path, contigs_fasta_path) + print(f"Number of contigs before trimming and joining: {i}") + + haplo_assembly = Assembly(contigs_file=contigs_fasta_path) + reads_prefix = os.path.join(tmp_dir, 'reads') + reads_1 = reads_prefix + '_1.fa' + reads_2 = reads_prefix + '_2.fa' + deinterleave(joined_path, reads_1, reads_2, fasta_out=True) + haplo_assembly._trim_strand_biased_ends(reads_prefix, tag_as_trimmed=True) + haplo_assembly._remove_contained_contigs(list(haplo_assembly.contigs.keys())) + haplo_assembly._merge_overlapping_contigs(list(haplo_assembly.contigs.keys())) + contigs_fasta_path = os.path.join(haplo_out_path, 'finalcontigs.fasta') + haplo_assembly.write_contigs_to_file(contigs_fasta_path) + os.chdir(start_dir) duration = datetime.now() - start_time contig_count = write_contig_refs(contigs_fasta_path, From 8a36f336513bc86b467982a2e0c3f1623d28840e Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 11 Aug 2021 13:07:33 -0700 Subject: [PATCH 06/18] Add optional scaffolding and patching --- micall/core/denovo.py | 93 ++++++++++++++++++++++++++++--------------- micall_docker.py | 23 ++++++++++- 2 files changed, 84 insertions(+), 32 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index ab8b99d48..ec1d0bc20 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -27,6 +27,7 @@ from pyfastaq.tasks import deinterleave HAPLOFLOW = "haploflow" +RAGTAG = ".venv/bin/RagTag/ragtag.py" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', 'blast_db', @@ -211,14 +212,18 @@ def denovo(fastq1_path: str, if haplo_args is None: haplo_args = {'long': 0, - 'filter': 500, - 'thres': -1, - 'strict': 5, - 'error': 0.02, - 'kmer': 41} + 'filter': 500, + 'thres': -1, + 'strict': 5, + 'error': 0.02, + 'kmer': 41, + 'merge': False, + 'scaffold': False, + 'patch': False, + 'ref': None} haplo_out_path = os.path.join(tmp_dir, 'haplo_out') contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') - haplo_args = [HAPLOFLOW, + haplo_cmd = [HAPLOFLOW, '--read-file', joined_path, '--out', haplo_out_path, '--k', str(haplo_args['kmer']), @@ -229,7 +234,7 @@ def denovo(fastq1_path: str, '--long', str(haplo_args['long']), '--debug', '1'] try: - run(haplo_args, check=True, stdout=PIPE, stderr=STDOUT) + run(haplo_cmd, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: output = ex.output and ex.output.decode('UTF8') if output != 'Failed to make first seed. Cannot continue\n': @@ -238,31 +243,57 @@ def denovo(fastq1_path: str, with open(contigs_fasta_path, 'a'): pass - fh, abs_path = mkstemp() - i = 0 - with fdopen(fh, 'w') as new_file: - with open(contigs_fasta_path) as old_file: - for line in old_file: - if line.startswith('>'): - new_file.write(f">contig{i}\n") - i += 1 - else: - new_file.write(line) - copymode(contigs_fasta_path, abs_path) - remove(contigs_fasta_path) - move(abs_path, contigs_fasta_path) - print(f"Number of contigs before trimming and joining: {i}") + if haplo_args['merge']: + fh, abs_path = mkstemp() + i = 0 + with fdopen(fh, 'w') as new_file: + with open(contigs_fasta_path) as old_file: + for line in old_file: + if line.startswith('>'): + new_file.write(f">contig{i}\n") + i += 1 + else: + new_file.write(line) + copymode(contigs_fasta_path, abs_path) + remove(contigs_fasta_path) + move(abs_path, contigs_fasta_path) + print(f"Number of contigs before trimming and joining: {i}") - haplo_assembly = Assembly(contigs_file=contigs_fasta_path) - reads_prefix = os.path.join(tmp_dir, 'reads') - reads_1 = reads_prefix + '_1.fa' - reads_2 = reads_prefix + '_2.fa' - deinterleave(joined_path, reads_1, reads_2, fasta_out=True) - haplo_assembly._trim_strand_biased_ends(reads_prefix, tag_as_trimmed=True) - haplo_assembly._remove_contained_contigs(list(haplo_assembly.contigs.keys())) - haplo_assembly._merge_overlapping_contigs(list(haplo_assembly.contigs.keys())) - contigs_fasta_path = os.path.join(haplo_out_path, 'finalcontigs.fasta') - haplo_assembly.write_contigs_to_file(contigs_fasta_path) + haplo_assembly = Assembly(contigs_file=contigs_fasta_path) + reads_prefix = os.path.join(tmp_dir, 'reads') + reads_1 = reads_prefix + '_1.fa' + reads_2 = reads_prefix + '_2.fa' + deinterleave(joined_path, reads_1, reads_2, fasta_out=True) + haplo_assembly._trim_strand_biased_ends(reads_prefix, tag_as_trimmed=True) + haplo_assembly._remove_contained_contigs(list(haplo_assembly.contigs.keys())) + haplo_assembly._merge_overlapping_contigs(list(haplo_assembly.contigs.keys())) + contigs_fasta_path = os.path.join(haplo_out_path, 'contigs_merged.fasta') + haplo_assembly.write_contigs_to_file(contigs_fasta_path) + + if haplo_args['scaffold']: + new_contigs_fasta_path = os.path.join(haplo_out_path, 'contigs_scaffolded.fasta') + scaffold_cmd = ['python3.8', + RAGTAG, + 'scaffold', + haplo_args['ref'], + contigs_fasta_path, + '-o', new_contigs_fasta_path, + '--aligner', 'nucmer', + '--nucmer_params', "'--maxmatch -l 100 -c 65'"] + run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) + contigs_fasta_path = new_contigs_fasta_path + + if haplo_args['patch']: + new_contigs_fasta_path = os.path.join(haplo_out_path, 'contigs_patched.fasta') + patch_cmd = ['python3.8', + RAGTAG, + 'patch', + contigs_fasta_path, + haplo_args['ref'], + '-o', new_contigs_fasta_path, + '--nucmer_params', "'--maxmatch -l 100 -c 65'"] + run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) + contigs_fasta_path = new_contigs_fasta_path os.chdir(start_dir) duration = datetime.now() - start_time diff --git a/micall_docker.py b/micall_docker.py index 3b076169c..1ba0cd594 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -373,6 +373,23 @@ def get_parser(default_max_active): type=int, default=41, ) + command_parser.add_argument( + "-denovo_merge", + action='store_true', + ) + command_parser.add_argument( + "-scaffold", + action='store_true', + ) + command_parser.add_argument( + "-patch", + action='store_true', + ) + command_parser.add_argument( + "-ref", + type=str, + default=None, + ) return parser @@ -978,7 +995,11 @@ def process_sample(sample, args, pssm, use_denovo=False): 'thres': args.haplo_thres, 'strict': args.haplo_strict, 'error': args.haplo_error, - 'kmer': args.haplo_kmer} + 'kmer': args.haplo_kmer, + 'merge':args.denovo_merge, + 'scaffold': args.scaffold, + 'patch': args.patch, + 'ref': args.ref} try: excluded_seeds = [] if args.all_projects else EXCLUDED_SEEDS excluded_projects = [] if args.all_projects else EXCLUDED_PROJECTS From d8b1e46cf110d319c0ce4c99102e2673a4b12d44 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 11 Aug 2021 14:27:23 -0700 Subject: [PATCH 07/18] Correct contig paths --- micall/core/denovo.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index ec1d0bc20..c67d91783 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -271,29 +271,29 @@ def denovo(fastq1_path: str, haplo_assembly.write_contigs_to_file(contigs_fasta_path) if haplo_args['scaffold']: - new_contigs_fasta_path = os.path.join(haplo_out_path, 'contigs_scaffolded.fasta') + scaffolding_path = os.path.join(haplo_out_path, 'scaffolding') scaffold_cmd = ['python3.8', RAGTAG, 'scaffold', haplo_args['ref'], contigs_fasta_path, - '-o', new_contigs_fasta_path, + '-o', scaffolding_path, '--aligner', 'nucmer', '--nucmer_params', "'--maxmatch -l 100 -c 65'"] run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) - contigs_fasta_path = new_contigs_fasta_path + contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') if haplo_args['patch']: - new_contigs_fasta_path = os.path.join(haplo_out_path, 'contigs_patched.fasta') + patching_path = os.path.join(haplo_out_path, 'patching') patch_cmd = ['python3.8', RAGTAG, 'patch', contigs_fasta_path, haplo_args['ref'], - '-o', new_contigs_fasta_path, + '-o', patching_path, '--nucmer_params', "'--maxmatch -l 100 -c 65'"] run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) - contigs_fasta_path = new_contigs_fasta_path + contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') os.chdir(start_dir) duration = datetime.now() - start_time From 06c043ffa1ab1fe20cfa7f4dff22f55ab54ad44d Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 11 Aug 2021 16:16:23 -0700 Subject: [PATCH 08/18] Handle unsuccessful scaffolding or patching --- micall/core/denovo.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index c67d91783..a43a1e6b4 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -27,7 +27,7 @@ from pyfastaq.tasks import deinterleave HAPLOFLOW = "haploflow" -RAGTAG = ".venv/bin/RagTag/ragtag.py" +RAGTAG = "/home/charlotte/Documents/Git/MiCall/.venv/bin/RagTag/ragtag.py" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', 'blast_db', @@ -279,9 +279,13 @@ def denovo(fastq1_path: str, contigs_fasta_path, '-o', scaffolding_path, '--aligner', 'nucmer', - '--nucmer_params', "'--maxmatch -l 100 -c 65'"] - run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) - contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') + '--nucmer-params', "'--maxmatch -l 100 -c 65'"] + try: + run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) + contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') + except CalledProcessError as e: + print(e) + pass if haplo_args['patch']: patching_path = os.path.join(haplo_out_path, 'patching') @@ -291,9 +295,13 @@ def denovo(fastq1_path: str, contigs_fasta_path, haplo_args['ref'], '-o', patching_path, - '--nucmer_params', "'--maxmatch -l 100 -c 65'"] - run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) - contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') + '--nucmer-params', "'--maxmatch -l 100 -c 65'"] + try: + run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) + contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') + except CalledProcessError as e: + print(e) + pass os.chdir(start_dir) duration = datetime.now() - start_time From 75cb275aa0b0f47f57c59582150013fddb9808ef Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 11 Aug 2021 16:50:33 -0700 Subject: [PATCH 09/18] Better handle unsuccessful scaffolding or patching --- micall/core/denovo.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index a43a1e6b4..a441bbf8e 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -280,12 +280,13 @@ def denovo(fastq1_path: str, '-o', scaffolding_path, '--aligner', 'nucmer', '--nucmer-params', "'--maxmatch -l 100 -c 65'"] - try: - run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) - contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') - except CalledProcessError as e: - print(e) - pass + run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) + new_contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') + if os.path.getsize(new_contigs_fasta_path) > 0: + print('Scaffolding was successful!') + contigs_fasta_path = new_contigs_fasta_path + else: + print('Scaffolding was not successful') if haplo_args['patch']: patching_path = os.path.join(haplo_out_path, 'patching') @@ -296,12 +297,13 @@ def denovo(fastq1_path: str, haplo_args['ref'], '-o', patching_path, '--nucmer-params', "'--maxmatch -l 100 -c 65'"] - try: - run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) - contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') - except CalledProcessError as e: - print(e) - pass + run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) + new_contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') + if os.path.getsize(new_contigs_fasta_path) > 0: + print('Patching was successful!') + contigs_fasta_path = new_contigs_fasta_path + else: + print('Patching was not successful') os.chdir(start_dir) duration = datetime.now() - start_time From bdf5704c002b5ec33bf5f236610583e4e3d09beb Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 11 Aug 2021 17:12:08 -0700 Subject: [PATCH 10/18] Correct nucmer options --- micall/core/denovo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index a441bbf8e..883d02c22 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -279,7 +279,7 @@ def denovo(fastq1_path: str, contigs_fasta_path, '-o', scaffolding_path, '--aligner', 'nucmer', - '--nucmer-params', "'--maxmatch -l 100 -c 65'"] + '--nucmer-params', '--maxmatch -l 100 -c 65'] run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) new_contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') if os.path.getsize(new_contigs_fasta_path) > 0: @@ -296,7 +296,7 @@ def denovo(fastq1_path: str, contigs_fasta_path, haplo_args['ref'], '-o', patching_path, - '--nucmer-params', "'--maxmatch -l 100 -c 65'"] + '--nucmer-params', '--maxmatch -l 100 -c 65'] run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) new_contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') if os.path.getsize(new_contigs_fasta_path) > 0: From 89d5c2eb419f32f85a51334af9d1a4b93677e376 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Thu, 12 Aug 2021 14:06:40 -0700 Subject: [PATCH 11/18] Add option for a second try at assembly with filtered reads --- micall/core/denovo.py | 87 +++++++++++++++++++++++++++++++++++++++++-- micall_docker.py | 7 +++- 2 files changed, 90 insertions(+), 4 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 883d02c22..cb90565e9 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -25,6 +25,7 @@ from os import fdopen, remove from iva.assembly import Assembly from pyfastaq.tasks import deinterleave +from micall.core.remap import remap, map_to_contigs HAPLOFLOW = "haploflow" RAGTAG = "/home/charlotte/Documents/Git/MiCall/.venv/bin/RagTag/ragtag.py" @@ -178,6 +179,32 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): return samples +def separate_contigs(contigs_csv, ref_contigs_csv, noref_contigs_csv): + """ Separate contigs into those that mapped to or did not map to a reference. + :param contigs_csv: file with contigs, open in read mode + :param ref_contigs_csv: file for contigs that mapped to a reference, open in write mode + :param noref_contigs_csv: file for contigs that did not map to a reference, open in write mode + """ + threshold = 0.1 + # is a match threshold sufficient or do we need info from blast_csv as well? + fieldnames = ['ref', 'match', 'group_ref', 'contig'] + ref_contig_writer = DictWriter(ref_contigs_csv, fieldnames) + ref_contig_writer.writeheader() + noref_contig_writer = DictWriter(noref_contigs_csv, fieldnames) + noref_contig_writer.writeheader() + contig_reader = DictReader(contigs_csv) + num_total = 0 + num_match = 0 + for row in contig_reader: + num_total += 1 + if float(row['match']) > threshold: + ref_contig_writer.writerow(row) + num_match += 1 + else: + noref_contig_writer.writerow(row) + return num_total - num_match + + def denovo(fastq1_path: str, fastq2_path: str, contigs_csv: typing.TextIO, @@ -220,7 +247,8 @@ def denovo(fastq1_path: str, 'merge': False, 'scaffold': False, 'patch': False, - 'ref': None} + 'ref': None, + 'RP': False} haplo_out_path = os.path.join(tmp_dir, 'haplo_out') contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') haplo_cmd = [HAPLOFLOW, @@ -231,8 +259,7 @@ def denovo(fastq1_path: str, '--strict', str(haplo_args['strict']), '--filter', str(haplo_args['filter']), '--thres', str(haplo_args['thres']), - '--long', str(haplo_args['long']), - '--debug', '1'] + '--long', str(haplo_args['long'])] try: run(haplo_cmd, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: @@ -243,6 +270,60 @@ def denovo(fastq1_path: str, with open(contigs_fasta_path, 'a'): pass + if haplo_args['RP']: + contigs_firstpass = os.path.join(haplo_out_path, "contigs_firstpass.csv") + blast_firstpass = os.path.join(haplo_out_path, "blast_firstpass.csv") + ref_contigs = os.path.join(haplo_out_path, "ref_contigs.csv") + noref_contigs = os.path.join(haplo_out_path, "noref_contigs.csv") + with open(contigs_firstpass, 'w') as contigs_firstpass_csv, \ + open(blast_firstpass, 'w') as blast_firstpass_csv: + contig_count = write_contig_refs(contigs_fasta_path, + contigs_firstpass_csv, + blast_csv=blast_firstpass_csv) + with open(contigs_firstpass, 'r') as contigs_firstpass_csv, \ + open(ref_contigs, 'w') as ref_contigs_csv, \ + open(noref_contigs, 'w') as noref_contigs_csv: + num_noref = separate_contigs(contigs_firstpass_csv, ref_contigs_csv, noref_contigs_csv) + print(f"Assembled {contig_count} contigs in the first pass, of which {num_noref} did not map to a reference.") + unmapped1_path = os.path.join(haplo_out_path, 'firstpass_unmapped1.fastq') + unmapped2_path = os.path.join(haplo_out_path, 'firstpass_unmapped2.fastq') + if num_noref: + with open(os.path.join(haplo_out_path, 'firstpass_remap.csv'), 'w') as remap_csv, \ + open(os.path.join(haplo_out_path, 'firstpass_remap_counts.csv'), 'w') as counts_csv, \ + open(os.path.join(haplo_out_path, 'firstpass_remap_conseq.csv'), 'w') as conseq_csv, \ + open(unmapped1_path, 'w') as unmapped1, \ + open(unmapped2_path, 'w') as unmapped2, \ + open(noref_contigs, 'r') as noref_contigs_csv: + map_to_contigs(fastq1_path, + fastq2_path, + noref_contigs_csv, + remap_csv, + counts_csv, + conseq_csv, + unmapped1, + unmapped2, + haplo_out_path, ) + # we want to use the reads that did not map to the contigs that did not blast to the refs + filtered_joined_path = os.path.join(haplo_out_path, 'filtered_joined.fastq') + run(['merge-mates', + unmapped1_path, + unmapped2_path, + '--interleave', + '-o', filtered_joined_path], + check=True) + haplo_out_path = os.path.join(tmp_dir, 'haplo_secondpass_out') + contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') + haplo_cmd = [HAPLOFLOW, + '--read-file', filtered_joined_path, + '--out', haplo_out_path, + '--k', str(haplo_args['kmer']), + '--error-rate', str(haplo_args['error']), + '--strict', str(haplo_args['strict']), + '--filter', str(haplo_args['filter']), + '--thres', str(haplo_args['thres']), + '--long', str(haplo_args['long'])] + run(haplo_cmd, check=True, stdout=PIPE, stderr=STDOUT) + if haplo_args['merge']: fh, abs_path = mkstemp() i = 0 diff --git a/micall_docker.py b/micall_docker.py index 1ba0cd594..9ed7fa4cb 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -390,6 +390,10 @@ def get_parser(default_max_active): type=str, default=None, ) + command_parser.add_argument( + "-RP", + action='store_true', + ) return parser @@ -999,7 +1003,8 @@ def process_sample(sample, args, pssm, use_denovo=False): 'merge':args.denovo_merge, 'scaffold': args.scaffold, 'patch': args.patch, - 'ref': args.ref} + 'ref': args.ref, + 'RP': args.RP} try: excluded_seeds = [] if args.all_projects else EXCLUDED_SEEDS excluded_projects = [] if args.all_projects else EXCLUDED_PROJECTS From 7b75e640593265f164bdcfad6ac66fa9bb52f6f8 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 18 Aug 2021 18:04:23 -0700 Subject: [PATCH 12/18] Use information from remap.csv to separate reads --- micall/core/denovo.py | 52 +++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index cb90565e9..d6f85d4db 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -205,6 +205,32 @@ def separate_contigs(contigs_csv, ref_contigs_csv, noref_contigs_csv): return num_total - num_match +def separate_reads(remap_csv, ref_reads_file, noref_reads_file, unmapped1, unmapped2): + """ Separate reads from remap.csv file into those that mapped to un unknown partial and the rest. + + :param remap_csv: remap output file created by map_to_contigs, open in read mode + :param ref_reads_file: file to write potentially useful reads (that mapped to useful contigs or that did not map) + :param noref_reads_file: file to write useless reads (that mapped to unknown contig) + :param unmapped1: fasta file 1 of reads that did not map + :param unmapped2: fasta file 2 of reads that did not map + """ + fieldnames = ['qname', 'flag', 'rname', 'pos', 'mapq', 'cigar', 'rnext', 'pnext', 'tlen', 'seq', 'qual'] + remap_reader = DictReader(remap_csv) + for row in remap_reader: + if row['rname'][-16:] == "-unknown-partial": + file_to_write = noref_reads_file + else: + file_to_write = ref_reads_file + file_to_write.write('@'+row['qname']+'\n') + file_to_write.write(row['seq']+'\n') + file_to_write.write('+\n') + file_to_write.write(row['qual']+'\n') + for line in unmapped1: + ref_reads_file.write(line) + for line in unmapped2: + ref_reads_file.write(line) + + def denovo(fastq1_path: str, fastq2_path: str, contigs_csv: typing.TextIO, @@ -287,34 +313,36 @@ def denovo(fastq1_path: str, print(f"Assembled {contig_count} contigs in the first pass, of which {num_noref} did not map to a reference.") unmapped1_path = os.path.join(haplo_out_path, 'firstpass_unmapped1.fastq') unmapped2_path = os.path.join(haplo_out_path, 'firstpass_unmapped2.fastq') + remap_path = os.path.join(haplo_out_path, 'firstpass_remap.csv') if num_noref: - with open(os.path.join(haplo_out_path, 'firstpass_remap.csv'), 'w') as remap_csv, \ + with open(remap_path, 'w') as remap_csv, \ open(os.path.join(haplo_out_path, 'firstpass_remap_counts.csv'), 'w') as counts_csv, \ open(os.path.join(haplo_out_path, 'firstpass_remap_conseq.csv'), 'w') as conseq_csv, \ open(unmapped1_path, 'w') as unmapped1, \ open(unmapped2_path, 'w') as unmapped2, \ - open(noref_contigs, 'r') as noref_contigs_csv: + open(contigs_firstpass, 'r') as contigs_csv: map_to_contigs(fastq1_path, fastq2_path, - noref_contigs_csv, + contigs_csv, remap_csv, counts_csv, conseq_csv, unmapped1, unmapped2, haplo_out_path, ) - # we want to use the reads that did not map to the contigs that did not blast to the refs - filtered_joined_path = os.path.join(haplo_out_path, 'filtered_joined.fastq') - run(['merge-mates', - unmapped1_path, - unmapped2_path, - '--interleave', - '-o', filtered_joined_path], - check=True) + # we want to discard the reads that mapped to the contigs that did not blast to the refs + ref_reads_path = os.path.join(haplo_out_path, 'ref_reads.fasta') + noref_reads_path = os.path.join(haplo_out_path, 'noref_reads.fasta') + with open(remap_path, 'r') as remap_csv, \ + open(ref_reads_path, 'w') as ref_reads_file, \ + open(noref_reads_path, 'w') as noref_reads_file, \ + open(unmapped1_path, 'r') as unmapped1, \ + open(unmapped2_path, 'r') as unmapped2: + separate_reads(remap_csv, ref_reads_file, noref_reads_file, unmapped1, unmapped2) haplo_out_path = os.path.join(tmp_dir, 'haplo_secondpass_out') contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') haplo_cmd = [HAPLOFLOW, - '--read-file', filtered_joined_path, + '--read-file', ref_reads_path, '--out', haplo_out_path, '--k', str(haplo_args['kmer']), '--error-rate', str(haplo_args['error']), From cb5949b1b20c53c774606c8315f38ab22b61c4c3 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 18 Aug 2021 20:29:15 -0700 Subject: [PATCH 13/18] Correct filename --- micall/core/denovo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index d6f85d4db..50d80696c 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -320,10 +320,10 @@ def denovo(fastq1_path: str, open(os.path.join(haplo_out_path, 'firstpass_remap_conseq.csv'), 'w') as conseq_csv, \ open(unmapped1_path, 'w') as unmapped1, \ open(unmapped2_path, 'w') as unmapped2, \ - open(contigs_firstpass, 'r') as contigs_csv: + open(contigs_firstpass, 'r') as contigs_firstpass_csv: map_to_contigs(fastq1_path, fastq2_path, - contigs_csv, + contigs_firstpass_csv, remap_csv, counts_csv, conseq_csv, From adf13f59d58f403be9f9a52f8a45f0eaf35ea2c1 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 1 Sep 2021 11:47:06 -0700 Subject: [PATCH 14/18] Add option for IVA assembly and modify scaffolding parameters --- micall/core/denovo.py | 109 ++++++++++++++++++++++++++---------------- micall_docker.py | 7 ++- 2 files changed, 74 insertions(+), 42 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 50d80696c..643f727cc 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -28,6 +28,7 @@ from micall.core.remap import remap, map_to_contigs HAPLOFLOW = "haploflow" +IVA = "iva" RAGTAG = "/home/charlotte/Documents/Git/MiCall/.venv/bin/RagTag/ragtag.py" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', @@ -274,33 +275,59 @@ def denovo(fastq1_path: str, 'scaffold': False, 'patch': False, 'ref': None, - 'RP': False} - haplo_out_path = os.path.join(tmp_dir, 'haplo_out') - contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') - haplo_cmd = [HAPLOFLOW, - '--read-file', joined_path, - '--out', haplo_out_path, - '--k', str(haplo_args['kmer']), - '--error-rate', str(haplo_args['error']), - '--strict', str(haplo_args['strict']), - '--filter', str(haplo_args['filter']), - '--thres', str(haplo_args['thres']), - '--long', str(haplo_args['long'])] - try: - run(haplo_cmd, check=True, stdout=PIPE, stderr=STDOUT) - except CalledProcessError as ex: - output = ex.output and ex.output.decode('UTF8') - if output != 'Failed to make first seed. Cannot continue\n': - logger.warning('Haploflow failed to assemble.', exc_info=True) - logger.warning(output) - with open(contigs_fasta_path, 'a'): - pass + 'RP': False, + 'IVA': False} + if not haplo_args['IVA']: + assembly_out_path = os.path.join(tmp_dir, 'haplo_out') + contigs_fasta_path = os.path.join(assembly_out_path, 'contigs.fa') + haplo_cmd = [HAPLOFLOW, + '--read-file', joined_path, + '--out', assembly_out_path, + '--k', str(haplo_args['kmer']), + '--error-rate', str(haplo_args['error']), + '--strict', str(haplo_args['strict']), + '--filter', str(haplo_args['filter']), + '--thres', str(haplo_args['thres']), + '--long', str(haplo_args['long'])] + try: + run(haplo_cmd, check=True, stdout=PIPE, stderr=STDOUT) + except CalledProcessError as ex: + output = ex.output and ex.output.decode('UTF8') + if output != 'Failed to make first seed. Cannot continue\n': + logger.warning('Haploflow failed to assemble.', exc_info=True) + logger.warning(output) + with open(contigs_fasta_path, 'a'): + pass + else: + assembly_out_path = os.path.join(tmp_dir, 'iva_out') + contigs_fasta_path = os.path.join(assembly_out_path, 'contigs.fasta') + iva_args = [IVA, '--fr', joined_path, '-t', '2'] + if merged_contigs_csv is not None: + seeds_fasta_path = os.path.join(tmp_dir, 'seeds.fasta') + with open(seeds_fasta_path, 'w') as seeds_fasta: + SeqIO.write((SeqRecord(Seq(row['contig']), f'seed-{i}', '', '') + for i, row in enumerate(DictReader(merged_contigs_csv))), + seeds_fasta, + 'fasta') + seeds_size = seeds_fasta.tell() + if seeds_size > 0: + iva_args.extend(['--contigs', seeds_fasta_path, '--make_new_seeds']) + iva_args.append(assembly_out_path) + try: + run(iva_args, check=True, stdout=PIPE, stderr=STDOUT) + except CalledProcessError as ex: + output = ex.output and ex.output.decode('UTF8') + if output != 'Failed to make first seed. Cannot continue\n': + logger.warning('iva failed to assemble.', exc_info=True) + logger.warning(output) + with open(contigs_fasta_path, 'a'): + pass if haplo_args['RP']: - contigs_firstpass = os.path.join(haplo_out_path, "contigs_firstpass.csv") - blast_firstpass = os.path.join(haplo_out_path, "blast_firstpass.csv") - ref_contigs = os.path.join(haplo_out_path, "ref_contigs.csv") - noref_contigs = os.path.join(haplo_out_path, "noref_contigs.csv") + contigs_firstpass = os.path.join(assembly_out_path, "contigs_firstpass.csv") + blast_firstpass = os.path.join(assembly_out_path, "blast_firstpass.csv") + ref_contigs = os.path.join(assembly_out_path, "ref_contigs.csv") + noref_contigs = os.path.join(assembly_out_path, "noref_contigs.csv") with open(contigs_firstpass, 'w') as contigs_firstpass_csv, \ open(blast_firstpass, 'w') as blast_firstpass_csv: contig_count = write_contig_refs(contigs_fasta_path, @@ -311,13 +338,13 @@ def denovo(fastq1_path: str, open(noref_contigs, 'w') as noref_contigs_csv: num_noref = separate_contigs(contigs_firstpass_csv, ref_contigs_csv, noref_contigs_csv) print(f"Assembled {contig_count} contigs in the first pass, of which {num_noref} did not map to a reference.") - unmapped1_path = os.path.join(haplo_out_path, 'firstpass_unmapped1.fastq') - unmapped2_path = os.path.join(haplo_out_path, 'firstpass_unmapped2.fastq') - remap_path = os.path.join(haplo_out_path, 'firstpass_remap.csv') + unmapped1_path = os.path.join(assembly_out_path, 'firstpass_unmapped1.fastq') + unmapped2_path = os.path.join(assembly_out_path, 'firstpass_unmapped2.fastq') + remap_path = os.path.join(assembly_out_path, 'firstpass_remap.csv') if num_noref: with open(remap_path, 'w') as remap_csv, \ - open(os.path.join(haplo_out_path, 'firstpass_remap_counts.csv'), 'w') as counts_csv, \ - open(os.path.join(haplo_out_path, 'firstpass_remap_conseq.csv'), 'w') as conseq_csv, \ + open(os.path.join(assembly_out_path, 'firstpass_remap_counts.csv'), 'w') as counts_csv, \ + open(os.path.join(assembly_out_path, 'firstpass_remap_conseq.csv'), 'w') as conseq_csv, \ open(unmapped1_path, 'w') as unmapped1, \ open(unmapped2_path, 'w') as unmapped2, \ open(contigs_firstpass, 'r') as contigs_firstpass_csv: @@ -329,21 +356,21 @@ def denovo(fastq1_path: str, conseq_csv, unmapped1, unmapped2, - haplo_out_path, ) + assembly_out_path, ) # we want to discard the reads that mapped to the contigs that did not blast to the refs - ref_reads_path = os.path.join(haplo_out_path, 'ref_reads.fasta') - noref_reads_path = os.path.join(haplo_out_path, 'noref_reads.fasta') + ref_reads_path = os.path.join(assembly_out_path, 'ref_reads.fasta') + noref_reads_path = os.path.join(assembly_out_path, 'noref_reads.fasta') with open(remap_path, 'r') as remap_csv, \ open(ref_reads_path, 'w') as ref_reads_file, \ open(noref_reads_path, 'w') as noref_reads_file, \ open(unmapped1_path, 'r') as unmapped1, \ open(unmapped2_path, 'r') as unmapped2: separate_reads(remap_csv, ref_reads_file, noref_reads_file, unmapped1, unmapped2) - haplo_out_path = os.path.join(tmp_dir, 'haplo_secondpass_out') - contigs_fasta_path = os.path.join(haplo_out_path, 'contigs.fa') + assembly_out_path = os.path.join(tmp_dir, 'haplo_secondpass_out') + contigs_fasta_path = os.path.join(assembly_out_path, 'contigs.fa') haplo_cmd = [HAPLOFLOW, '--read-file', ref_reads_path, - '--out', haplo_out_path, + '--out', assembly_out_path, '--k', str(haplo_args['kmer']), '--error-rate', str(haplo_args['error']), '--strict', str(haplo_args['strict']), @@ -376,11 +403,11 @@ def denovo(fastq1_path: str, haplo_assembly._trim_strand_biased_ends(reads_prefix, tag_as_trimmed=True) haplo_assembly._remove_contained_contigs(list(haplo_assembly.contigs.keys())) haplo_assembly._merge_overlapping_contigs(list(haplo_assembly.contigs.keys())) - contigs_fasta_path = os.path.join(haplo_out_path, 'contigs_merged.fasta') + contigs_fasta_path = os.path.join(assembly_out_path, 'contigs_merged.fasta') haplo_assembly.write_contigs_to_file(contigs_fasta_path) if haplo_args['scaffold']: - scaffolding_path = os.path.join(haplo_out_path, 'scaffolding') + scaffolding_path = os.path.join(assembly_out_path, 'scaffolding') scaffold_cmd = ['python3.8', RAGTAG, 'scaffold', @@ -388,7 +415,7 @@ def denovo(fastq1_path: str, contigs_fasta_path, '-o', scaffolding_path, '--aligner', 'nucmer', - '--nucmer-params', '--maxmatch -l 100 -c 65'] + '--nucmer-params', '--maxmatch -l 30 -c 20'] run(scaffold_cmd, check=True, stdout=PIPE, stderr=STDOUT) new_contigs_fasta_path = os.path.join(scaffolding_path, 'ragtag.scaffold.fasta') if os.path.getsize(new_contigs_fasta_path) > 0: @@ -398,14 +425,14 @@ def denovo(fastq1_path: str, print('Scaffolding was not successful') if haplo_args['patch']: - patching_path = os.path.join(haplo_out_path, 'patching') + patching_path = os.path.join(assembly_out_path, 'patching') patch_cmd = ['python3.8', RAGTAG, 'patch', contigs_fasta_path, haplo_args['ref'], '-o', patching_path, - '--nucmer-params', '--maxmatch -l 100 -c 65'] + '--nucmer-params', '--maxmatch -l 30 -c 20'] run(patch_cmd, check=True, stdout=PIPE, stderr=STDOUT) new_contigs_fasta_path = os.path.join(patching_path, 'ragtag.patch.fasta') if os.path.getsize(new_contigs_fasta_path) > 0: diff --git a/micall_docker.py b/micall_docker.py index f33ea7c6c..2b0c3fae0 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -394,6 +394,10 @@ def get_parser(default_max_active): "-RP", action='store_true', ) + command_parser.add_argument( + "-IVA", + action='store_true', + ) return parser @@ -1004,7 +1008,8 @@ def process_sample(sample, args, pssm, use_denovo=False): 'scaffold': args.scaffold, 'patch': args.patch, 'ref': args.ref, - 'RP': args.RP} + 'RP': args.RP, + 'IVA': args.IVA} try: excluded_seeds = [] if args.all_projects else EXCLUDED_SEEDS excluded_projects = [] if args.all_projects else EXCLUDED_PROJECTS From 138dcb20fc9bbece819bb761d091016c3e26ac46 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 18 Sep 2023 17:51:23 -0700 Subject: [PATCH 15/18] Install Haploflow during docker initialization --- Dockerfile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Dockerfile b/Dockerfile index 2a0c7ea41..cc4729cca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -83,6 +83,13 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ tar -xzf smalt-0.7.6-bin.tar.gz --no-same-owner && \ ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt +## Installing Haploflow +RUN apt-get install -y build-essential sudo git ronn \ + cd /opt/ && + git clone https://github.com/hzi-bifo/Haploflow \ + cd /opt/Haploflow && sh build.sh \ + ln -s /opt/Haploflow/build/haploflow /bin/haploflow + ## Install dependencies for genetracks/drawsvg RUN apt-get install -q -y libcairo2-dev From c621522561d0e3b97ddfdd663db170db552d52a0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 18 Sep 2023 17:59:39 -0700 Subject: [PATCH 16/18] Install Haploflow on CI --- .github/workflows/build-and-test.yml | 9 +++++++++ Dockerfile | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index de5ebe481..edd1309f9 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -16,6 +16,15 @@ jobs: with: go-version: '^1.15.6' + - name: Install Haploflow + run: | + sudo apt-get update + sudo apt-get install -y build-essential git ronn + cd /opt/ + git clone https://github.com/hzi-bifo/Haploflow + cd /opt/Haploflow && sh build.sh + sudo ln -s /opt/Haploflow/build/haploflow ~/bin/haploflow + - name: Install IVA assembler dependencies run: | sudo apt-get install -qq zlib1g-dev libncurses5-dev libncursesw5-dev mummer ncbi-blast+ diff --git a/Dockerfile b/Dockerfile index cc4729cca..ca8c41ee3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -83,7 +83,7 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ tar -xzf smalt-0.7.6-bin.tar.gz --no-same-owner && \ ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt -## Installing Haploflow +## Install Haploflow RUN apt-get install -y build-essential sudo git ronn \ cd /opt/ && git clone https://github.com/hzi-bifo/Haploflow \ From 272635dd822af6da69279518cc3401ba05e06f35 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 19 Sep 2023 08:45:16 -0700 Subject: [PATCH 17/18] Pin exact Haploflow version Note that I am using a specific commit version for pinning. Rationalization: Haploflow is not developed actively, and because of that, the last release is quite far behind master. Thus, I think it's better to pin its version to the current master than to the latest release tag. --- .github/workflows/build-and-test.yml | 4 +++- Dockerfile | 11 +++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index edd1309f9..62fee2da4 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -22,7 +22,9 @@ jobs: sudo apt-get install -y build-essential git ronn cd /opt/ git clone https://github.com/hzi-bifo/Haploflow - cd /opt/Haploflow && sh build.sh + cd Haploflow + git checkout 9a5a0ff6c3a0435e723e41f98fe82ec2ad19cf50 + sh build.sh sudo ln -s /opt/Haploflow/build/haploflow ~/bin/haploflow - name: Install IVA assembler dependencies diff --git a/Dockerfile b/Dockerfile index ca8c41ee3..181f61910 100644 --- a/Dockerfile +++ b/Dockerfile @@ -84,10 +84,13 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt ## Install Haploflow -RUN apt-get install -y build-essential sudo git ronn \ - cd /opt/ && - git clone https://github.com/hzi-bifo/Haploflow \ - cd /opt/Haploflow && sh build.sh \ +RUN apt-get update && \ + apt-get install -y build-essential sudo git ronn cmake && \ + cd /opt/ && \ + git clone https://github.com/hzi-bifo/Haploflow && \ + cd Haploflow && \ + git checkout 9a5a0ff6c3a0435e723e41f98fe82ec2ad19cf50 && \ + yes | sh build.sh && \ ln -s /opt/Haploflow/build/haploflow /bin/haploflow ## Install dependencies for genetracks/drawsvg From 5ecb109a9e1cd9613168efb803fb91edd1e0a38e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 20 Sep 2023 13:54:30 -0700 Subject: [PATCH 18/18] Switch to Debian in Singularity image --- Singularity | 79 +++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 54 deletions(-) diff --git a/Singularity b/Singularity index e32803957..b59394534 100644 --- a/Singularity +++ b/Singularity @@ -1,6 +1,6 @@ # Generate the Singularity container to run MiCall on Kive. Bootstrap: docker -From: centos:7 +From: python:3.8 %help MiCall maps all the reads from a sample against a set of reference @@ -53,48 +53,28 @@ From: centos:7 %post echo ===== Installing Prerequisites ===== >/dev/null - yum update -q -y - - yum groupinstall -q -y 'development tools' - yum install -q -y epel-release - yum install -q -y unzip wget fontconfig bzip2-devel xz-devel openssl-devel \ - libffi-devel sqlite-devel - - echo ===== Installing Python ===== >/dev/null - wget -q https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tar.xz - tar xJf Python* - rm Python*.xz - cd Python* - ./configure --enable-optimizations - make altinstall - cd .. - rm -rf Python* - ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 + apt-get update -q + apt-get install -q -y unzip wget echo ===== Installing blast ===== >/dev/null - cd /root - # Saved our own copy, because download was slow from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-1.x86_64.rpm - wget -q https://github.com/cfe-lab/MiCall/releases/download/v7.12.dev28/ncbi-blast-2.6.0+-1.x86_64.rpm - yum install -q -y ncbi-blast-2.6.0+-1.x86_64.rpm - rm ncbi-blast-2.6.0+-1.x86_64.rpm - python3 /opt/micall/micall/blast_db/make_blast_db.py + apt-get install -q -y ncbi-blast+ echo ===== Installing Rust and merge-mates ===== >/dev/null - yum install -q -y rust cargo + wget -qO rustup.sh https://sh.rustup.rs + chmod +x /rustup.sh + /rustup.sh -y -q + . /root/.cargo/env + rm rustup.sh cargo install --quiet --root / --git https://github.com/jeff-k/merge-mates.git --rev 2fec61363f645e2008a4adff553d098beae21469 - ## Miniconda (Python 2) (Don't use this) - #wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh - #bash miniconda.sh -b -p /opt/miniconda - echo ===== Installing bowtie2 ===== >/dev/null wget -q -O bowtie2.zip https://github.com/BenLangmead/bowtie2/releases/download/v2.2.8/bowtie2-2.2.8-linux-x86_64.zip - unzip -qq bowtie2.zip -d /opt/ + unzip bowtie2.zip -d /opt/ ln -s /opt/bowtie2-2.2.8/ /opt/bowtie2 rm bowtie2.zip echo ===== Installing IVA dependencies ===== >/dev/null - yum install -q -y tcsh ncurses-devel zlib-devel + apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev cd /bin wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc_dump @@ -122,32 +102,23 @@ From: centos:7 tar -xzf smalt-0.7.6-bin.tar.gz --no-same-owner ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt + echo ===== Installing Haploflow ===== >/dev/null + apt-get install -q -y libboost-all-dev build-essential sudo git ronn cmake + cd /opt/ + git clone https://github.com/hzi-bifo/Haploflow + cd Haploflow + git checkout 9a5a0ff6c3a0435e723e41f98fe82ec2ad19cf50 + yes | sh build.sh + echo ===== Installing Python packages ===== >/dev/null + # Install dependencies for genetracks/drawsvg + apt-get install -q -y libcairo2-dev # Also trigger matplotlib to build its font cache. - wget -q https://bootstrap.pypa.io/get-pip.py - python3 get-pip.py - rm get-pip.py cd /opt - pip install --quiet -r /opt/micall/requirements.txt - ln -s /usr/local/bin/cutadapt /usr/local/bin/cutadapt-1.11 - python3 -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' - - # Install dependencies for genetracks/drawsvg - yum install -q -y cairo-devel cairo cairo-tools zlib-devel - - yum groupremove -q -y 'development tools' - yum remove -q -y epel-release wget unzip - yum autoremove -q -y - yum clean all - - rm -rf /var/cache/yum - - ## CAUTION! This changes the default python command to python3! - ## This breaks many things, including yum! - ## To switch back to python2, use this command: - # sudo alternatives --set python /usr/bin/python2 - alternatives --install /usr/bin/python python /usr/bin/python2 50 - alternatives --install /usr/bin/python python /usr/local/bin/python3 60 + pip install --upgrade pip + pip install -r /opt/micall/requirements-basespace.txt + python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' + python /opt/micall/micall/blast_db/make_blast_db.py %environment export PATH=/opt/bowtie2:/bin:/usr/local/bin