From eef81ca1867a1f5ff5fec87d3f020a3dec3be363 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Fri, 11 Aug 2023 21:44:28 +0200 Subject: [PATCH] add viloca vary alpha --- .../config_varyvilocaparams/config.yaml | 5 +- .../config_varyvilocaparams/params.csv | 4 +- .../viloca_alpha_0.0000001.py | 130 ++++++++++++++++++ .../viloca_alpha_0.000001.py | 130 ++++++++++++++++++ .../viloca_alpha_0.00001.py | 130 ++++++++++++++++++ .../method_definitions/viloca_alpha_0.0001.py | 130 ++++++++++++++++++ 6 files changed, 527 insertions(+), 2 deletions(-) create mode 100644 resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0000001.py create mode 100644 resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001.py create mode 100644 resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py create mode 100644 resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0001.py diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/config.yaml b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/config.yaml index 6bb561d4..415ec36a 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/config.yaml +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/config.yaml @@ -1,5 +1,8 @@ method_list: [ - viloca_alpha, + viloca_alpha_0.0001, + viloca_alpha_0.00001, + viloca_alpha_0.000001, + viloca_alpha_0.0000001, #viloca_envp, #viloca_multi, #viloca_envp_multi, diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/params.csv b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/params.csv index 53182702..d9cc8818
100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/params.csv +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_varyvilocaparams/params.csv @@ -4,6 +4,8 @@ illumina,shotgun,single_amplicon,249,249,1000,5@5@30@10@15@geom@0.75 # Population 5 illumina,shotgun,single_amplicon,249,249,1000,5@15@30@10@15@geom@0.75 # Population 8 (ONT) -nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@6500@240@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@600@240@240@geom@0.75 # Population 10 (ONT) nanopore,shotgun,single_amplicon,5000,5000,1000,15@5@600@200@200@geom@0.75 +# amplicon mode +#illumina,amplicon,400:100,249,9719,1000,5@5@1200@480@480@geom@0.75 diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0000001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0000001.py new file mode 100644 index 00000000..f324619e --- /dev/null +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0000001.py @@ -0,0 +1,130 @@ +# GROUP: global +# CONDA: libshorah +# CONDA: biopython = 1.79 +# PIP: git+https://github.com/LaraFuhrmann/shorah@master + +import subprocess +from pathlib import Path +from os import listdir +from os.path import isfile, join +from Bio import SeqIO +import gzip + +def gunzip(source_filepath, dest_filepath, block_size=65536): + with gzip.open(source_filepath, 'rb') as s_file, \ + open(dest_filepath, 'wb') as d_file: + while True: + block = s_file.read(block_size) + if not block: + break + else: + d_file.write(block) + +def main(fname_bam, fname_reference,fname_insert_bed, fname_results_snv, fname_result_haplos, dname_work): + + genome_size = str(fname_bam).split('genome_size~')[1].split('__coverage')[0] + alpha = 0.0000001 + n_max_haplotypes = 500 + n_mfa_starts = 1 + win_min_ext = 0.85 + + read_length = 
str(fname_bam).split('read_length~')[1].split('__')[0] + if read_length == "Ten_strain_IAV": + sampler = "learn_error_params" + win_min_ext = 0.5 + else: + sampler = "use_quality_scores" + + dname_work.mkdir(parents=True, exist_ok=True) + if fname_insert_bed == []: + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + else: + # insert bed file is there + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "-z", + Path(fname_insert_bed).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + + (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) + + + mypath = (dname_work /'support').resolve() + onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] + print('onlyfiles', onlyfiles) + for file in onlyfiles: + if 'reads-support.fas' in file: + file_name = onlyfiles[0] + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + if file.endswith('.gz'): + fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve() + fname_haplos = onlyfiles[0].split('.gz')[0] + fname_unzipped = (dname_work / "support" / fname_haplos).resolve() + # unzip + gunzip(fname_zipped, fname_result_haplos) + + elif file.endswith('.fas'): + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + (dname_work / "support" / file).rename(fname_result_haplos) + + # fix frequency information + + freq_list = [] + for record in SeqIO.parse(fname_result_haplos, "fasta"): + freq_list.append(float(record.description.split('ave_reads=')[-1])) + 
norm_freq_list = [float(i)/sum(freq_list) for i in freq_list] + + record_list = [] + for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")): + record.description = f"freq:{norm_freq_list[idx]}" + record_list.append(record) + SeqIO.write(record_list, fname_result_haplos, "fasta") + + +if __name__ == "__main__": + main( + Path(snakemake.input.fname_bam), + Path(snakemake.input.fname_reference), + snakemake.input.fname_insert_bed, + Path(snakemake.output.fname_result), + Path(snakemake.output.fname_result_haplos), + Path(snakemake.output.dname_work), + ) diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001.py new file mode 100644 index 00000000..76bff69a --- /dev/null +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001.py @@ -0,0 +1,130 @@ +# GROUP: global +# CONDA: libshorah +# CONDA: biopython = 1.79 +# PIP: git+https://github.com/LaraFuhrmann/shorah@master + +import subprocess +from pathlib import Path +from os import listdir +from os.path import isfile, join +from Bio import SeqIO +import gzip + +def gunzip(source_filepath, dest_filepath, block_size=65536): + with gzip.open(source_filepath, 'rb') as s_file, \ + open(dest_filepath, 'wb') as d_file: + while True: + block = s_file.read(block_size) + if not block: + break + else: + d_file.write(block) + +def main(fname_bam, fname_reference,fname_insert_bed, fname_results_snv, fname_result_haplos, dname_work): + + genome_size = str(fname_bam).split('genome_size~')[1].split('__coverage')[0] + alpha = 0.000001 + n_max_haplotypes = 500 + n_mfa_starts = 1 + win_min_ext = 0.85 + + read_length = str(fname_bam).split('read_length~')[1].split('__')[0] + if read_length == "Ten_strain_IAV": + sampler = "learn_error_params" + win_min_ext = 0.5 + else: + sampler = "use_quality_scores" + + dname_work.mkdir(parents=True, 
exist_ok=True) + if fname_insert_bed == []: + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + else: + # insert bed file is there + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "-z", + Path(fname_insert_bed).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + + (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) + + + mypath = (dname_work /'support').resolve() + onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] + print('onlyfiles', onlyfiles) + for file in onlyfiles: + if 'reads-support.fas' in file: + file_name = onlyfiles[0] + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + if file.endswith('.gz'): + fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve() + fname_haplos = onlyfiles[0].split('.gz')[0] + fname_unzipped = (dname_work / "support" / fname_haplos).resolve() + # unzip + gunzip(fname_zipped, fname_result_haplos) + + elif file.endswith('.fas'): + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + (dname_work / "support" / file).rename(fname_result_haplos) + + # fix frequency information + + freq_list = [] + for record in SeqIO.parse(fname_result_haplos, "fasta"): + freq_list.append(float(record.description.split('ave_reads=')[-1])) + norm_freq_list = [float(i)/sum(freq_list) for i in freq_list] + + record_list = [] + for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")): + record.description = f"freq:{norm_freq_list[idx]}" + 
record_list.append(record) + SeqIO.write(record_list, fname_result_haplos, "fasta") + + +if __name__ == "__main__": + main( + Path(snakemake.input.fname_bam), + Path(snakemake.input.fname_reference), + snakemake.input.fname_insert_bed, + Path(snakemake.output.fname_result), + Path(snakemake.output.fname_result_haplos), + Path(snakemake.output.dname_work), + ) diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py new file mode 100644 index 00000000..0a1da654 --- /dev/null +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py @@ -0,0 +1,130 @@ +# GROUP: global +# CONDA: libshorah +# CONDA: biopython = 1.79 +# PIP: git+https://github.com/LaraFuhrmann/shorah@master + +import subprocess +from pathlib import Path +from os import listdir +from os.path import isfile, join +from Bio import SeqIO +import gzip + +def gunzip(source_filepath, dest_filepath, block_size=65536): + with gzip.open(source_filepath, 'rb') as s_file, \ + open(dest_filepath, 'wb') as d_file: + while True: + block = s_file.read(block_size) + if not block: + break + else: + d_file.write(block) + +def main(fname_bam, fname_reference,fname_insert_bed, fname_results_snv, fname_result_haplos, dname_work): + + genome_size = str(fname_bam).split('genome_size~')[1].split('__coverage')[0] + alpha = 0.00001 + n_max_haplotypes = 500 + n_mfa_starts = 1 + win_min_ext = 0.85 + + read_length = str(fname_bam).split('read_length~')[1].split('__')[0] + if read_length == "Ten_strain_IAV": + sampler = "learn_error_params" + win_min_ext = 0.5 + else: + sampler = "use_quality_scores" + + dname_work.mkdir(parents=True, exist_ok=True) + if fname_insert_bed == []: + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + 
"--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + else: + # insert bed file is there + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "-z", + Path(fname_insert_bed).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + + (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) + + + mypath = (dname_work /'support').resolve() + onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] + print('onlyfiles', onlyfiles) + for file in onlyfiles: + if 'reads-support.fas' in file: + file_name = onlyfiles[0] + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + if file.endswith('.gz'): + fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve() + fname_haplos = onlyfiles[0].split('.gz')[0] + fname_unzipped = (dname_work / "support" / fname_haplos).resolve() + # unzip + gunzip(fname_zipped, fname_result_haplos) + + elif file.endswith('.fas'): + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + (dname_work / "support" / file).rename(fname_result_haplos) + + # fix frequency information + + freq_list = [] + for record in SeqIO.parse(fname_result_haplos, "fasta"): + freq_list.append(float(record.description.split('ave_reads=')[-1])) + norm_freq_list = [float(i)/sum(freq_list) for i in freq_list] + + record_list = [] + for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")): + record.description = f"freq:{norm_freq_list[idx]}" + record_list.append(record) + SeqIO.write(record_list, fname_result_haplos, "fasta") + + +if __name__ == "__main__": + main( + Path(snakemake.input.fname_bam), + Path(snakemake.input.fname_reference), + 
snakemake.input.fname_insert_bed, + Path(snakemake.output.fname_result), + Path(snakemake.output.fname_result_haplos), + Path(snakemake.output.dname_work), + ) diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0001.py new file mode 100644 index 00000000..4dccab6d --- /dev/null +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.0001.py @@ -0,0 +1,130 @@ +# GROUP: global +# CONDA: libshorah +# CONDA: biopython = 1.79 +# PIP: git+https://github.com/LaraFuhrmann/shorah@master + +import subprocess +from pathlib import Path +from os import listdir +from os.path import isfile, join +from Bio import SeqIO +import gzip + +def gunzip(source_filepath, dest_filepath, block_size=65536): + with gzip.open(source_filepath, 'rb') as s_file, \ + open(dest_filepath, 'wb') as d_file: + while True: + block = s_file.read(block_size) + if not block: + break + else: + d_file.write(block) + +def main(fname_bam, fname_reference,fname_insert_bed, fname_results_snv, fname_result_haplos, dname_work): + + genome_size = str(fname_bam).split('genome_size~')[1].split('__coverage')[0] + alpha = 0.0001 + n_max_haplotypes = 500 + n_mfa_starts = 1 + win_min_ext = 0.85 + + read_length = str(fname_bam).split('read_length~')[1].split('__')[0] + if read_length == "Ten_strain_IAV": + sampler = "learn_error_params" + win_min_ext = 0.5 + else: + sampler = "use_quality_scores" + + dname_work.mkdir(parents=True, exist_ok=True) + if fname_insert_bed == []: + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + else: + # insert bed file is there + 
subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "-z", + Path(fname_insert_bed).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + ], + cwd=dname_work, + ) + + (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) + + + mypath = (dname_work /'support').resolve() + onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] + print('onlyfiles', onlyfiles) + for file in onlyfiles: + if 'reads-support.fas' in file: + file_name = onlyfiles[0] + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + if file.endswith('.gz'): + fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve() + fname_haplos = onlyfiles[0].split('.gz')[0] + fname_unzipped = (dname_work / "support" / fname_haplos).resolve() + # unzip + gunzip(fname_zipped, fname_result_haplos) + + elif file.endswith('.fas'): + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + (dname_work / "support" / file).rename(fname_result_haplos) + + # fix frequency information + + freq_list = [] + for record in SeqIO.parse(fname_result_haplos, "fasta"): + freq_list.append(float(record.description.split('ave_reads=')[-1])) + norm_freq_list = [float(i)/sum(freq_list) for i in freq_list] + + record_list = [] + for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")): + record.description = f"freq:{norm_freq_list[idx]}" + record_list.append(record) + SeqIO.write(record_list, fname_result_haplos, "fasta") + + +if __name__ == "__main__": + main( + Path(snakemake.input.fname_bam), + Path(snakemake.input.fname_reference), + snakemake.input.fname_insert_bed, + Path(snakemake.output.fname_result), + Path(snakemake.output.fname_result_haplos), + Path(snakemake.output.dname_work), + )