Skip to content

Commit

Permalink
add cond to suppress wf and remove empty.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
KateSakharova committed Aug 6, 2020
1 parent 124afdc commit 8b9be0c
Show file tree
Hide file tree
Showing 31 changed files with 183,743 additions and 160 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ services:

env:
- TOOL="Validate" CONF_SCRIPT="travis/conformance-tests.sh"
# - TOOL="SUPPRESS" TEST_SUPPRESS_SUBWF="travis/cwltest.sh"

install:
- travis/install-nodejs.sh
Expand All @@ -19,3 +20,4 @@ before_script:

script:
- bash ${CONF_SCRIPT}
# - bash ${TEST_SUPPRESS_SUBWF} travis/tests/amplicon-suppress/1.yml
183,246 changes: 183,246 additions & 0 deletions input_examples/amplicon/ERR632171_FASTQ.fasta

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,14 @@ steps:

# gzip embl
gzipped_embl:
run: ../../../../utils/gzip.cwl
run: ../../../../utils/pigz/gzip.cwl
in:
uncompressed_file: unite_embl/result
out: [ compressed_file ]

# gzip gbk
gzipped_gbk:
run: ../../../../utils/gzip.cwl
run: ../../../../utils/pigz/gzip.cwl
in:
uncompressed_file: unite_gbk/result
out: [ compressed_file ]
Expand Down
2 changes: 1 addition & 1 deletion tools/RNA_prediction/biom-convert/biom-convert.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ baseCommand: [ "biom-convert.sh" ]

inputs:
biom:
type: File
type: File?
format: edam:format_3746 # BIOM
inputBinding:
prefix: --input-fp
Expand Down
131 changes: 131 additions & 0 deletions tools/mask-for-ITS/its-length-new.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/python3
import glob
import argparse
import sys
import os
from Bio import SeqIO
import gzip
import shutil


def get_avg_length(masked_its):  # get average length of longest ITS sequences - separated by 'N'
    """Return the mean length of the longest 'N'-free fragment of each record.

    Every record of the gzip-compressed FASTA file is split on 'N'; the length
    of its longest fragment is recorded, and the mean of those lengths is
    returned truncated to an int.

    :param masked_its: path to a gzip-compressed FASTA file, or None
    :return: truncated mean length as int; 0 when the path is None, the path
        does not exist, or the file contains no records
    """
    if masked_its is None or not os.path.exists(masked_its):
        return 0

    all_lengths = []
    with gzip.open(masked_its, 'rt') as unzipped_file:
        for record in SeqIO.parse(unzipped_file, 'fasta'):
            # A record consisting only of 'N' (or an empty one) counts as length 0.
            all_lengths.append(max((len(part) for part in record.seq.split('N')), default=0))

    if not all_lengths:
        # A file without any record would otherwise divide by zero below.
        return 0
    return int(sum(all_lengths) / len(all_lengths))


def hits_to_num_ratio(fasta, input_folder):  # ratio of mapseq hits to number of total seqs LSU/SSU
    """Return (sum of hit counts in the TSV) / (number of FASTA records).

    The hit counts are read from the second tab-separated column of the first
    ``*.tsv`` file that ``glob`` finds in ``input_folder``; lines starting with
    '#' and blank lines are ignored. The number of records is the number of
    lines starting with '>' in the gzip-compressed ``fasta`` file.

    :param fasta: path to a gzip-compressed FASTA file; when its relative path
        contains the substring 'empty' nothing is read and 0 is returned
    :param input_folder: directory holding the ``*.tsv`` file
    :return: the ratio as float, or 0 when the path contains 'empty' or the
        FASTA file has no records
    :raises IndexError: when ``input_folder`` contains no ``*.tsv`` file
    """
    if 'empty' in os.path.relpath(fasta):
        return 0

    rna_sum = 0
    tsv_pattern = os.path.join(input_folder, '*.tsv')
    with open(glob.glob(tsv_pattern)[0], 'r') as rna_hits:
        for line in rna_hits:
            # Skip comment/header lines and blank lines (the latter have no second column).
            if line.startswith('#') or not line.strip():
                continue
            rna_sum += float(line.split('\t')[1])

    # Use a context manager so the gzip handle is closed deterministically.
    with gzip.open(fasta, 'rt') as sequences:
        rna_num = sum(1 for line in sequences if line.startswith('>'))

    if rna_num == 0:
        # No sequences at all: report "no hits" instead of dividing by zero.
        return 0
    return float(rna_sum / rna_num)


def validate_hits(ssu_fasta, lsu_fasta, ssu_folder, lsu_folder, len_avg):  # check length and ratio and assign tag
    """Choose which results to keep: 'ITS', 'rRNA' or 'both'.

    The SSU and LSU hit ratios come from ``hits_to_num_ratio``; a ratio is
    taken as 0 when the corresponding folder is None.

    Decision table (``supported`` = SSU ratio > 0.1 or LSU ratio > 0.1):

    * ``len_avg > 200``:          'both' if supported, else 'ITS'
    * ``120 <= len_avg <= 199``:  'rRNA' if supported, else 'ITS'
    * anything else:              'rRNA'

    :param ssu_fasta: SSU FASTA path passed to ``hits_to_num_ratio``
    :param lsu_fasta: LSU FASTA path passed to ``hits_to_num_ratio``
    :param ssu_folder: SSU directory, or None
    :param lsu_folder: LSU directory, or None
    :param len_avg: average ITS length
    :return: one of the strings 'ITS', 'rRNA', 'both'
    """
    ssu_ratio = hits_to_num_ratio(ssu_fasta, ssu_folder) if ssu_folder is not None else 0
    lsu_ratio = hits_to_num_ratio(lsu_fasta, lsu_folder) if lsu_folder is not None else 0

    # Both ratios must be compared with the threshold. The previous form
    # `ssu_ratio or lsu_ratio > 0.1` parsed as `ssu_ratio or (lsu_ratio > 0.1)`,
    # so any non-zero SSU ratio passed regardless of the threshold.
    rna_supported = ssu_ratio > 0.1 or lsu_ratio > 0.1

    if len_avg > 200:
        return 'both' if rna_supported else 'ITS'
    if 120 <= len_avg <= 199:
        return 'rRNA' if rna_supported else 'ITS'
    # NOTE(review): len_avg == 200 matches neither range above and therefore
    # ends up here as 'rRNA' - confirm which bucket 200 is meant to belong to.
    return 'rRNA'


def suppress_dir(flag, lsu, ssu, its, its_file, ssu_file, lsu_file):
    """Copy result folders and FASTA files to their kept or suppressed location.

    Creates the directories 'suppressed' and 'taxonomy-summary' in the current
    working directory, then handles every marker (LSU, SSU, its) whose folder
    is not None:

    * kept: the folder is copied to ``taxonomy-summary/<marker>`` and the
      FASTA file is copied into the current directory under its base name;
    * suppressed: the folder is copied to ``suppressed/<marker>`` and the
      FASTA file is copied into ``suppressed``.

    With flag 'ITS' only 'its' is kept, with 'rRNA' only LSU and SSU are kept,
    with 'both' everything is kept. Any other flag copies nothing.

    :param flag: 'ITS', 'rRNA' or 'both'
    :param lsu: LSU directory, or None to skip LSU
    :param ssu: SSU directory, or None to skip SSU
    :param its: ITS directory, or None to skip ITS
    :param its_file: ITS FASTA file, or None
    :param ssu_file: SSU FASTA file, or None
    :param lsu_file: LSU FASTA file, or None
    :raises FileExistsError: when 'suppressed' or 'taxonomy-summary' already
        exists in the current working directory
    """
    suppressed_folder = 'suppressed'
    taxonomy_summary = 'taxonomy-summary'
    os.mkdir(suppressed_folder)
    os.mkdir(taxonomy_summary)

    folder_copies = []
    file_copies = []
    markers = ((lsu, 'LSU', lsu_file), (ssu, 'SSU', ssu_file), (its, 'its', its_file))
    for folder, name, cur_file in markers:
        if folder is None:
            continue
        if flag == 'both':
            keep = True
        elif flag == 'ITS':
            keep = name == 'its'
        elif flag == 'rRNA':
            keep = name != 'its'
        else:
            # Unknown tag: nothing is copied for it.
            continue

        destination_root = taxonomy_summary if keep else suppressed_folder
        folder_copies.append((folder, destination_root + '/' + name))
        # A directory may be supplied without its FASTA file; there is then
        # nothing to copy (previously this crashed on basename(None)).
        if cur_file is not None:
            file_copies.append((cur_file, os.path.basename(cur_file) if keep else suppressed_folder))

    # All folders are copied before any file, as plain loops because the
    # copies are done purely for their side effects.
    for src, dest in folder_copies:
        shutil.copytree(src, dest)
    for src, dest in file_copies:
        shutil.copy(src, dest)


if __name__ == '__main__':
    # Command-line entry point: compute the average ITS length, derive the
    # suppress tag from it and copy the result folders accordingly.
    parser = argparse.ArgumentParser(description="get average length of ITS sequences and suppress unwanted folders")

    # (option, destination attribute, help text), in the order they are registered.
    cli_options = (
        ("--lsu-file", "lsu_file", "lsu fasta"),
        ("--ssu-file", "ssu_file", "ssu fasta"),
        ("--its-file", "its_file", "its fasta"),
        ("--lsu-dir", "lsu_directory", "directory in path taxonomy-summary/LSU"),
        ("--ssu-dir", "ssu_directory", "directory in path taxonomy-summary/SSU"),
        ("--its-dir", "its_directory", "directory in path taxonomy-summary/its"),
    )
    for option, destination, help_text in cli_options:
        parser.add_argument(option, dest=destination, help=help_text, required=False, default=None)

    if len(sys.argv) >= 3:
        args = parser.parse_args()
        avg = get_avg_length(args.its_file)
        print('average ITS length is ' + str(avg))
        print('suppressing...')
        suppress_flag = validate_hits(args.ssu_file, args.lsu_file, args.ssu_directory, args.lsu_directory, avg)
        print(suppress_flag)
        suppress_dir(suppress_flag, args.lsu_directory, args.ssu_directory, args.its_directory, args.its_file,
                     args.ssu_file, args.lsu_file)
        # Remove whichever output directory received nothing.
        for output_dir in ('suppressed', 'taxonomy-summary'):
            if not os.listdir(output_dir):
                os.rmdir(output_dir)
    else:
        # Fewer than two command-line tokens after the script name: only show the usage.
        parser.print_help()
7 changes: 3 additions & 4 deletions tools/mask-for-ITS/its-length.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env /hps/nobackup2/production/metagenomics/pipeline/tools-v5/miniconda3-4.6.14/bin/python3
#!/usr/bin/python3
import glob
import argparse
import sys
Expand All @@ -25,7 +25,7 @@ def get_avg_length(masked_its): # get average length of longest ITS sequences -
else:
return 0

def hits_to_num_ratio(fasta, input_folder): # ratio of mapseq hits to number of total seqs LSU/SSU
def hits_to_num_ratio(fasta, input_folder): # ratio of mapseq hits to number of total seqs LSU/SSU
rna_sum, rna_num = [0 for _ in range(2)]
rna = os.path.join(input_folder, '*.tsv')
if 'empty' not in os.path.relpath(fasta):
Expand Down Expand Up @@ -93,5 +93,4 @@ def suppress_dir(flag, lsu, ssu, its, its_file, ssu_file, lsu_file):
validate_hits(args.ssu_file, args.lsu_file, args.ssu_directory, args.lsu_directory, avg)
print('suppressing...')
suppress_flag = validate_hits(args.ssu_file, args.lsu_file, args.ssu_directory, args.lsu_directory, avg)
suppress_dir(suppress_flag, args.lsu_directory, args.ssu_directory, args.its_directory, args.its_file, args.ssu_file, args.lsu_file)

suppress_dir(suppress_flag, args.lsu_directory, args.ssu_directory, args.its_directory, args.its_file, args.ssu_file, args.lsu_file)
32 changes: 15 additions & 17 deletions tools/mask-for-ITS/suppress_tax.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -11,51 +11,49 @@ requirements:

inputs:
ssu_file:
type: File
type: File?
inputBinding:
prefix: --ssu-file
lsu_file:
type: File
type: File?
inputBinding:
prefix: --lsu-file
its_file:
type: File
type: File?
inputBinding:
prefix: --its-file
lsu_dir:
type: Directory
default: "LSU"
type: Directory?
# default: "LSU"
inputBinding:
prefix: --lsu-dir
ssu_dir:
type: Directory
default: "SSU"
type: Directory?
# default: "SSU"
inputBinding:
prefix: --ssu-dir
its_dir:
type: Directory
default: "its"
type: Directory?
# default: "its"
inputBinding:
prefix: --its-dir


baseCommand: [its-length.py]
baseCommand: [ its-length-new.py ]
stdout: ITS_LENGTH

outputs:
stdout: stdout
its_length: stdout
out_tax:
type: Directory
type: Directory?
outputBinding:
glob: "taxonomy-summary"
out_suppress:
type: Directory
type: Directory?
outputBinding:
glob: "suppressed"
out_fastas:
type:
type: array
items: File
out_fastas_tax:
type: File[]?
outputBinding:
glob: "*.fasta.gz"

Expand Down
13 changes: 7 additions & 6 deletions tools/mask-for-ITS/suppress_tax_test.yml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
ssu_file:
class: File
path: SSU.fasta.gz
path: test-input/SSU/SSU.fasta.gz
lsu_file:
class: File
path: LSU.fasta.gz
path: test-input/LSU/LSU.fasta.gz
its_file:
class: File
path: ITS_masked.fasta.gz
path: test-input/its/its.fasta.gz
lsu_dir:
class: Directory
path: taxonomy-summary/LSU
path: test-input/LSU
ssu_dir:
class: Directory
path: taxonomy-summary/SSU
path: test-input/SSU
its_dir:
class: Directory
path: taxonomy-summary/its
path: test-input/its

Binary file added tools/mask-for-ITS/test-input/LSU/LSU.fasta.gz
Binary file not shown.
12 changes: 12 additions & 0 deletions tools/mask-for-ITS/test-input/LSU/LSU.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Constructed from biom file
# OTU ID LSU_rRNA taxonomy taxid
25676 3.0 sk__Eukaryota 2759
16424 1.0 sk__Eukaryota;k__Fungi 4751
20472 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Leotiomycetes 147548
1400 1.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes 147550
4544 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales 5125
15497 6.0 sk__Eukaryota;k__Fungi;p__Basidiomycota;c__Agaricomycetes 155619
2195 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes 214506
14320 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus 1129544
19603 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus;s__Rhizophagus_intraradices 4876
17712 3.0 sk__Eukaryota;k__Viridiplantae;p__Streptophyta;c__Magnoliopsida 3398
Binary file added tools/mask-for-ITS/test-input/SSU/SSU.fasta.gz
Binary file not shown.
12 changes: 12 additions & 0 deletions tools/mask-for-ITS/test-input/SSU/SSU.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Constructed from biom file
# OTU ID SSU_rRNA taxonomy taxid
25676 3.0 sk__Eukaryota 2759
16424 1.0 sk__Eukaryota;k__Fungi 4751
20472 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Leotiomycetes 147548
1400 1.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes 147550
4544 2.0 sk__Eukaryota;k__Fungi;p__Ascomycota;c__Sordariomycetes;o__Hypocreales 5125
15497 6.0 sk__Eukaryota;k__Fungi;p__Basidiomycota;c__Agaricomycetes 155619
2195 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes 214506
14320 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus 1129544
19603 1.0 sk__Eukaryota;k__Fungi;p__Mucoromycota;c__Glomeromycetes;o__Glomerales;f__Glomeraceae;g__Rhizophagus;s__Rhizophagus_intraradices 4876
17712 3.0 sk__Eukaryota;k__Viridiplantae;p__Streptophyta;c__Magnoliopsida 3398
Binary file added tools/mask-for-ITS/test-input/its/its.fasta.gz
Binary file not shown.
9 changes: 9 additions & 0 deletions travis/cwltest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Wrapper around cwltest: every argument given to this script is forwarded to
# cwltest, which is invoked with cwltool as its tool.

# Put the Node.js directory under $HOME first on PATH.
PATH="$HOME/node-v8.11.1:$PATH"
export PATH

# scripts path
#SCRIPTS_PATHS=$(readlink -f "../../bin")
#PATH="$SCRIPTS_PATHS":$PATH

# NOTE(review): the arguments after the bare "--" are presumably handed on to
# cwltool - confirm against the cwltest documentation.
cwltest "$@" --tool cwltool -- --enable-dev
Loading

0 comments on commit 8b9be0c

Please sign in to comment.