-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add cond to suppress wf and remove empty.sh
- Loading branch information
1 parent
124afdc
commit 8b9be0c
Showing
31 changed files
with
183,743 additions
and
160 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
183,246 changes: 183,246 additions & 0 deletions
183,246
input_examples/amplicon/ERR632171_FASTQ.fasta
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
#!/usr/bin/python3 | ||
import glob | ||
import argparse | ||
import sys | ||
import os | ||
from Bio import SeqIO | ||
import gzip | ||
import shutil | ||
|
||
|
||
def get_avg_length(masked_its):
    """Return the average length of the longest unmasked stretch per ITS record.

    Every record of the gzip-compressed FASTA file is split on the masking
    character 'N'; the longest resulting fragment is taken for each record and
    those lengths are averaged (truncated to an int).

    :param masked_its: path to a gzip-compressed FASTA file, or None
    :return: the truncated average length, or 0 when the path is None, does
        not exist, or the file contains no records
    """
    if masked_its is None or not os.path.exists(masked_its):
        return 0

    longest_lengths = []
    with gzip.open(masked_its, 'rt') as unzipped_file:
        for record in SeqIO.parse(unzipped_file, 'fasta'):
            # Empty fragments (adjacent or leading/trailing 'N') have length 0,
            # so they never win unless the whole record is masked.
            fragment_lengths = (len(fragment) for fragment in record.seq.split('N'))
            longest_lengths.append(max(fragment_lengths, default=0))

    # A FASTA file without records would otherwise divide by zero.
    if not longest_lengths:
        return 0
    return int(sum(longest_lengths) / len(longest_lengths))
|
||
|
||
def hits_to_num_ratio(fasta, input_folder):
    """Return the ratio of summed mapseq hit counts to the number of sequences.

    The hit counts are read from the second tab-separated column of the first
    ``*.tsv`` file found in ``input_folder`` (lines starting with '#' are
    skipped). The number of sequences is the number of '>' header lines in the
    gzip-compressed FASTA file.

    :param fasta: path to a gzip-compressed LSU/SSU FASTA file; a relative path
        containing 'empty' marks a placeholder and yields 0
    :param input_folder: directory expected to hold the hits ``*.tsv`` file
    :return: hits / sequences as a float; 0.0 for a placeholder FASTA, when no
        TSV file is present, or when the FASTA holds no sequences
    """
    if 'empty' in os.path.relpath(fasta):
        return 0.0

    tsv_files = glob.glob(os.path.join(input_folder, '*.tsv'))
    if not tsv_files:
        # No hits table at all is treated the same as a table without hits.
        return 0.0

    rna_sum = 0.0
    with open(tsv_files[0], 'r') as rna_hits:
        for line in rna_hits:
            # Skip header/comment lines and blank lines (e.g. a trailing newline).
            if line.startswith('#') or not line.strip():
                continue
            rna_sum += float(line.split('\t')[1])

    # Use a context manager so the gzip handle is closed deterministically.
    with gzip.open(fasta, 'rt') as sequences:
        rna_num = sum(1 for line in sequences if line.startswith('>'))

    # Avoid ZeroDivisionError on a FASTA file without any records.
    if rna_num == 0:
        return 0.0
    return rna_sum / rna_num
|
||
|
||
def validate_hits(ssu_fasta, lsu_fasta, ssu_folder, lsu_folder, len_avg):
    """Choose which results to keep from the ITS length and the rRNA hit ratios.

    :param ssu_fasta: SSU FASTA path, passed to hits_to_num_ratio
    :param lsu_fasta: LSU FASTA path, passed to hits_to_num_ratio
    :param ssu_folder: SSU result directory, or None to treat the SSU ratio as 0
    :param lsu_folder: LSU result directory, or None to treat the LSU ratio as 0
    :param len_avg: average ITS length as returned by get_avg_length
    :return: one of the tags 'both', 'ITS' or 'rRNA'
    """
    ssu_ratio = hits_to_num_ratio(ssu_fasta, ssu_folder) if ssu_folder is not None else 0
    lsu_ratio = hits_to_num_ratio(lsu_fasta, lsu_folder) if lsu_folder is not None else 0

    # Each ratio has to be compared against the threshold on its own:
    # `ssu_ratio or lsu_ratio > 0.1` would accept any non-zero SSU ratio.
    has_rrna_hits = ssu_ratio > 0.1 or lsu_ratio > 0.1

    if len_avg > 200:
        return 'both' if has_rrna_hits else 'ITS'
    if 120 <= len_avg <= 199:
        return 'rRNA' if has_rrna_hits else 'ITS'
    # NOTE(review): an average of exactly 200 falls through to here together
    # with averages below 120 - confirm whether 200 was meant to belong to one
    # of the ranges above.
    return 'rRNA'
|
||
|
||
def suppress_dir(flag, lsu, ssu, its, its_file, ssu_file, lsu_file):
    """Copy result folders and FASTA files to their kept or suppressed location.

    Creates the directories 'suppressed' and 'taxonomy-summary' in the current
    working directory (os.mkdir raises if either already exists). For every
    result directory that is not None:

    * kept results: the directory is copied to 'taxonomy-summary/<NAME>' and
      its FASTA file is copied into the current working directory;
    * suppressed results: the directory is copied to 'suppressed/<NAME>' and
      its FASTA file is copied into 'suppressed'.

    With flag 'ITS' only the ITS results are kept, with 'rRNA' only the LSU/SSU
    results are kept, with 'both' everything is kept. Any other flag copies
    nothing.

    :param flag: tag returned by validate_hits ('ITS', 'rRNA' or 'both')
    :param lsu: LSU result directory or None
    :param ssu: SSU result directory or None
    :param its: ITS result directory or None
    :param its_file: ITS FASTA path or None
    :param ssu_file: SSU FASTA path or None
    :param lsu_file: LSU FASTA path or None
    """
    suppressed_folder = 'suppressed'
    taxonomy_summary = 'taxonomy-summary'
    os.mkdir(suppressed_folder)
    os.mkdir(taxonomy_summary)

    folder_copies = []
    file_copies = []
    results = ((lsu, '/LSU', lsu_file), (ssu, '/SSU', ssu_file), (its, '/its', its_file))
    for folder, name, cur_file in results:
        if folder is None:
            continue

        if flag == 'both':
            keep = True
        elif flag == 'ITS':
            keep = name == '/its'
        elif flag == 'rRNA':
            keep = name != '/its'
        else:
            # Unknown tag: nothing is copied for this result.
            continue

        if keep:
            folder_copies.append((folder, taxonomy_summary + name))
        else:
            folder_copies.append((folder, suppressed_folder + name))

        # A directory may be supplied without its FASTA file; only copy the
        # file when one was actually given.
        if cur_file is not None:
            file_dest = os.path.basename(cur_file) if keep else suppressed_folder
            file_copies.append((cur_file, file_dest))

    # Directories are copied first, then the individual FASTA files.
    for src, dest in folder_copies:
        shutil.copytree(src, dest)
    for src, dest in file_copies:
        shutil.copy(src, dest)
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: measure the average ITS length, decide which
    # results to keep and copy everything to its kept/suppressed location.
    parser = argparse.ArgumentParser(description="get average length of ITS sequences and suppress unwanted folders")
    # (option, dest, help) - all options are optional and default to None.
    option_table = (
        ("--lsu-file", "lsu_file", "lsu fasta"),
        ("--ssu-file", "ssu_file", "ssu fasta"),
        ("--its-file", "its_file", "its fasta"),
        ("--lsu-dir", "lsu_directory", "directory in path taxonomy-summary/LSU"),
        ("--ssu-dir", "ssu_directory", "directory in path taxonomy-summary/SSU"),
        ("--its-dir", "its_directory", "directory in path taxonomy-summary/its"),
    )
    for option, target, description in option_table:
        parser.add_argument(option, dest=target, help=description, required=False, default=None)

    if len(sys.argv) >= 3:
        cli = parser.parse_args()
        its_average = get_avg_length(cli.its_file)
        print('average ITS length is ' + str(its_average))
        print('suppressing...')
        tag = validate_hits(cli.ssu_file, cli.lsu_file, cli.ssu_directory, cli.lsu_directory, its_average)
        print(tag)
        suppress_dir(tag, cli.lsu_directory, cli.ssu_directory, cli.its_directory, cli.its_file,
                     cli.ssu_file, cli.lsu_file)
        # Drop output directories that ended up empty.
        for output_dir in ('suppressed', 'taxonomy-summary'):
            if not os.listdir(output_dir):
                os.rmdir(output_dir)
    else:
        # Too few command-line tokens to carry a single option with its value.
        parser.print_help()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,19 @@ | ||
ssu_file: | ||
class: File | ||
path: SSU.fasta.gz | ||
path: test-input/SSU/SSU.fasta.gz | ||
lsu_file: | ||
class: File | ||
path: LSU.fasta.gz | ||
path: test-input/LSU/LSU.fasta.gz | ||
its_file: | ||
class: File | ||
path: ITS_masked.fasta.gz | ||
path: test-input/its/its.fasta.gz | ||
lsu_dir: | ||
class: Directory | ||
path: taxonomy-summary/LSU | ||
path: test-input/LSU | ||
ssu_dir: | ||
class: Directory | ||
path: taxonomy-summary/SSU | ||
path: test-input/SSU | ||
its_dir: | ||
class: Directory | ||
path: taxonomy-summary/its | ||
path: test-input/its | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Constructed from biom file | ||
# OTU ID LSU_rRNA taxonomy taxid | ||
25676 3.0 sk__Eukaryota 2759 | ||
16424 1.0 sk__Eukaryota;k__Fungi 4751 | ||
20472 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Leotiomycetes 147548 | ||
1400 1.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes 147550 | ||
4544 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales 5125 | ||
15497 6.0 sk__Eukaryota;k__Fungi;p__Basidiomycota;c__Agaricomycetes 155619 | ||
2195 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes 214506 | ||
14320 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus 1129544 | ||
19603 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus;s__Rhizophagus_intraradices 4876 | ||
17712 3.0 sk__Eukaryota;k__Viridiplantae;p__Streptophyta;c__Magnoliopsida 3398 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Constructed from biom file | ||
# OTU ID SSU_rRNA taxonomy taxid | ||
25676 3.0 sk__Eukaryota 2759 | ||
16424 1.0 sk__Eukaryota;k__Fungi 4751 | ||
20472 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Leotiomycetes 147548 | ||
1400 1.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes 147550 | ||
4544 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales 5125 | ||
15497 6.0 sk__Eukaryota;k__Fungi;p__Basidiomycota;c__Agaricomycetes 155619 | ||
2195 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes 214506 | ||
14320 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus 1129544 | ||
19603 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus;s__Rhizophagus_intraradices 4876 | ||
17712 3.0 sk__Eukaryota;k__Viridiplantae;p__Streptophyta;c__Magnoliopsida 3398 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash
# Runs cwltest with cwltool selected as the tool under test. Arguments given
# to this script are forwarded to cwltest ahead of the fixed options.

# Put $HOME/node-v8.11.1 at the front of PATH
# (presumably a local Node.js install - confirm).
PATH="$HOME/node-v8.11.1:$PATH"
export PATH

# scripts path
#SCRIPTS_PATHS=$(readlink -f "../../bin")
#PATH="$SCRIPTS_PATHS":$PATH

# NOTE(review): everything after the bare `--` looks intended for cwltool
# itself rather than cwltest - confirm against the cwltest usage.
cwltest "$@" --tool cwltool -- --enable-dev
Oops, something went wrong.