diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..4638720 Binary files /dev/null and b/.DS_Store differ diff --git a/10_SCA1/bcoexample_10.json b/10_SCA1/bcoexample_10.json index 4411c68..1e55f16 100644 --- a/10_SCA1/bcoexample_10.json +++ b/10_SCA1/bcoexample_10.json @@ -5,7 +5,11 @@ "version": "1.0", "createdby": "amandab2140@gwmail.gwu.edu", "created": "Feb 02, 2017 11:11:00", - "modified": "Feb 23, 2017 15:42:38", + "modified": "Feb 23, 2017 15:42:38", + "authors": [ + {"orcid": "0000-0003-1409-4549"}, + {"orcid": "0000-0002-9920-565X"} + ], "digital_signature": "", "verification_status": "unreviewed", "publication_status": "draft", @@ -18,10 +22,6 @@ "Full-penetrance: 39 and above CAG trinucleotide repeats." ], "description_domain": { - "authors": [ - {"orchid": "0000-0003-1409-4549"}, - {"orchid": "0000-0002-9920-565X"} - ], "xref":[ "taxID:9606", "DO:0050954", @@ -69,10 +69,24 @@ "pipeline_version": "0.2", "script": "https://hive.biochemistry.gwu.edu/workflows/antiviral_resistance_detection_hive.sh", "driver": "//hive.biochemistry.gwu.edu/hive-driver", - "prerequisites": [ + "software_prerequisites": [ {"name":"HIVE_hexagon","version":"1.3"}, {"name":"HIVE_heptagon","version":"1.3"} - ] + ], + "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], + "env_parameters": [ + "HIVEv1.3" + ], + "script_type": "URI" }, "parametric_domain": { "trinucleotide_count_file": "36" diff --git a/1_HCV1a/bcoexample_1.json b/1_HCV1a/bcoexample_1.json index 1ae3492..be042f9 100644 --- a/1_HCV1a/bcoexample_1.json +++ b/1_HCV1a/bcoexample_1.json @@ -1,136 +1,132 @@ { - "id": "obj.1270", - "name": "HCV1a [taxID:31646] ledipasvir [PubChem:67505836] resistance SNP [SO:0000694] detection", - "version": "1.1", - "createdby": "hadley_king@gwmail.gwu.edu", - "created": "Jan 24, 2017 09:40:17", - "modified": "Jun 22, 2017 14:47:49", - 
"digital_signature": "905d7fce3f3ac64c8ea86f058ca71658", - "verification_status": "unreviewed", - "publication_status": "draft", - "usability_domain": [ - "Identify baseline single nucleotide polymorphisms SNPs [SO:0000694], insertions [SO:0000667], and deletions [SO:0000045] that correlate with reduced ledipasvir [PubChem:67505836] antiviral drug efficacy in Hepatitis C virus subtype 1 [taxID:31646]", - "Identify treatment emergent amino acid substitutions [SO:0000048] that correlate with antiviral drug treatment failure", - "Determine whether the treatment emergent amino acid substitutions [SO:0000048] identified correlate with treatment failure involving other drugs against the same virus", - "GitHub CWL example: https://github.com/mr-c/hive-cwl-examples/blob/master/workflow/hive-viral-mutation-detection.cwl#L20" - ], - "authors": [ - { - "orcid": "0000-0003-1409-4549" - }, - { - "name": "Eric Donaldson" - } - ], - "description_domain": { - "keywords": [ - "HCV1a", - "Ledipasvir", - "antiviral resistance", - "SNP", - "amino acid substitutions" - "test2" - ], - "xref": [ - "SO:0000694", - "SO:0000667", - "SO:0000045", - "PubChem:67505836", - "SO:0000048", - "taxID:31646", - "PMID:25123381", - "PMID:26508693" - ], - "pipeline_steps": [ - { - "tool_name": "HIVE-hexagon", - "tool_desc": "The tool is used for alignment of reads to a set of references", - "tool_version": "1.3", - "tool_package": "HIVE", - "step_number": "1", - "input_uri_list": [ - "https://www.ncbi.nlm.nih.gov/nuccore/NC_004102.1", - "https://www.ncbi.nlm.nih.gov/nuccore/AJ238799.1", - "https://hive/nuc-read/557406", - "https://hive/nuc-read/557407" - ], - "output_uri_list": [ - "https://hive/data/557413/allCount-aligned.csv" - ] - }, - { - "tool_name": "HIVE-heptagon", - "tool_desc": "This tool is used for variant calling", - "tool_version": "1.3", - "tool_package": "HIVE", - "step_number": "2", - "input_uri_list": [ - "https://hive/data/557413/allCount-aligned.csv" - ], - "output_uri_list": [ - 
"https://hive/data/557416/SNPProfile.csv" - ] - } - ] - }, - "execution_domain": { - "script": "https://hive.biochemistry.gwu.edu/workflows/antiviral_resistance_detection_hive.sh", - "pipeline_version": "2.0", - "platform": "HIVE", - "driver": "shell", - "prerequisites": [ - "name:HIVE-hexagon", - "version:1.3 ", - "name:HIVE-heptagon", - "version:1.3" - ], - "env_parameters": [ - "HIVEv1.3" - ], - "script_type": "URI" - }, - "parametric_domain": { - "heptagon_divergence_threshold_percent": "30", - "hexagon_minimum_coverage": "15", - "hexagon_seed": "14", - "heptagon_freq_cutoff": "0.10", - "hexagon_minimum_match_len": "66" - }, - "io_domain": { - "reference_uri_list": [ - "https://www.ncbi.nlm.nih.gov/nuccore/NC_004102.1", - "https://www.ncbi.nlm.nih.gov/nuccore/AJ238799.1" - ], - "input_uri_list": [ - "https://hive/data/nuc-read/557406", - "https://hive/data/nuc-read/557407", - "https://hive/data/557413/allCount-aligned.csv" - ], - "output_uri_list": [ - "https://hive/data/557413/allCount-aligned.csv", - "https//hive/data/557416/SNPProfile.csv" - ], - "output_subdomain": [ - "hit_list ", - "title:hit list", - "nuri:https://hive/data/557413/allCount-aligned.csv", - "mime-type:csv", - "mutation_profile:", - "title:mutation profile", - "uri:https//hive/data/557416/SNPProfile.csv", - "mime-type:csv" - ], - "input_subdomain": [ - "Read_Files:", - "https://hive/data/nuc-read/557406", - "https://hive/data/nuc-read/557407", - "HCV1a_reference_genome", - "https://www.ncbi.nlm.nih.gov/nuccore/NC_004102.1", - "https://www.ncbi.nlm.nih.gov/nuccore/AJ238799.1" - ] - }, - "error_domain": [ - "false negative alignment hits < 0.0010", - "false positive mutation calls discovery < 0.0005" - ] -} + "id": "obj.1270", + "name": "HCV1a [taxonomy:31646] ledipasvir [PubChem:67505836] resistance SNP [so:0000694] detection", + "structured_name": "HCV1a [taxonomy:31646] ledipasvir [PubChem:67505836] resistance SNP [so:0000694] detection", + "version": "1.1", + "createdby": 
"hadley_king@gwmail.gwu.edu", + "created": "Jan 24, 2017 09:40:17", + "modified": "Jun 25, 2019 14:47:49", + "digital_signature": "905d7fce3f3ac64c8ea86f058ca71658", + "verification_status": "unreviewed", + "publication_status": "draft", + "usability_domain": [ + "Identify baseline single nucleotide polymorphisms SNPs [so:0000694], insertions [so:0000667], and deletions [so:0000045] that correlate with reduced ledipasvir [PubChem:67505836] antiviral drug efficacy in Hepatitis C virus subtype 1 [taxonomy:31646]", + "Identify treatment emergent amino acid substitutions [so:0000048] that correlate with antiviral drug treatment failure", + "Determine whether the treatment emergent amino acid substitutions [so:0000048] identified correlate with treatment failure involving other drugs against the same virus", + "GitHub CWL example: https://github.com/mr-c/hive-cwl-examples/blob/master/workflow/hive-viral-mutation-detection.cwl#L20" + ], + "authors": [ + { + "orcid": "0000-0003-1409-4549" + }, + { + "name": "Eric Donaldson" + } + ], + "description_domain": { + "keywords": [ + "HCV1a", + "Ledipasvir", + "antiviral resistance", + "SNP", + "amino acid substitutions" + ], + "xref": [ + "so:0000694", + "so:0000667", + "so:0000045", + "PubChem:67505836", + "so:0000048", + "taxonomy:31646", + "pubmed:25123381", + "pubmed:26508693" + ], + "pipeline_steps": [ + { + "tool_name": "HIVE-hexagon", + "tool_desc": "The tool is used for alignment of reads to a set of references", + "tool_version": "1.3", + "tool_package": "HIVE", + "step_number": "1", + "input_uri_list": [ + "https://www.ncbi.nlm.nih.gov/nuccore/NC_004102.1", + "https://www.ncbi.nlm.nih.gov/nuccore/AJ238799.1", + "https://hive/nuc-read/557406", + "https://hive/nuc-read/557407" + ], + "output_uri_list": [ + "https://hive/data/557413/allCount-aligned.csv" + ] + }, + { + "tool_name": "HIVE-heptagon", + "tool_desc": "This tool is used for variant calling", + "tool_version": "1.3", + "tool_package": "HIVE", + "step_number": 
"2", + "input_uri_list": [ + "https://hive/data/557413/allCount-aligned.csv" + ], + "output_uri_list": [ + "https://hive/data/557416/SNPProfile.csv" + ] + } + ] + }, + "execution_domain": { + "script": "https://hive.biochemistry.gwu.edu/workflows/antiviral_resistance_detection_hive.sh", + "pipeline_version": "2.0", + "platform": "HIVE", + "driver": "shell", + "software_prerequisites": [ + { + "name": "HIVE_hexagon", + "version": "1.3" + }, + { + "name": "HIVE_heptagon", + "version": "1.3" + } + ], + "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], + "env_parameters": [ + "HIVEv1.3" + ], + "script_type": "URI" + }, + "parametric_domain": { + "heptagon_divergence_threshold_percent": "30", + "hexagon_minimum_coverage": "15", + "hexagon_seed": "14", + "heptagon_freq_cutoff": "0.10", + "hexagon_minimum_match_len": "66" + }, + "io_domain": { + "reference_uri_list": [ + "https://www.ncbi.nlm.nih.gov/nuccore/NC_004102.1", + "https://www.ncbi.nlm.nih.gov/nuccore/AJ238799.1" + ], + "input_uri_list": [ + "https://hive/data/nuc-read/557406", + "https://hive/data/nuc-read/557407", + "https://hive/data/557413/allCount-aligned.csv" + ], + "output_uri_list": [ + "https://hive/data/557413/allCount-aligned.csv", + "https//hive/data/557416/SNPProfile.csv" + ] + }, + "error_domain": [ + "false negative alignment hits < 0.0010", + "false positive mutation calls discovery < 0.0005" + ] +} \ No newline at end of file diff --git a/2_Gut/bcoexample_2.json b/2_Gut/bcoexample_2.json index 23ab837..285ab35 100644 --- a/2_Gut/bcoexample_2.json +++ b/2_Gut/bcoexample_2.json @@ -1,115 +1,136 @@ { - "id": "obj.1276", - "name": "Healthy human [taxID:9606] fecal [UBERON:0001988] metagenomic diversity", - "version": "1.2", - "createdby": "hadley_king@gwmail.gwu.edu", - "created": "Jan 26, 2017 16:35:29", - "modified": "May 11, 2017 11:43:42", - "digital_signature": 
"QSEQa6HNzERVIMONZWEj", - "verification_status": "unreviewed", - "publication_status": "draft", - "usability_domain": [ - "Identify the most common organism present in a human [taxID:9606] fecal [UBERON:0001988] sample, ", - "Identify the general community composition of organisms in a human [taxID:9606] fecal [UBERON:0001988] sample, ", - "CensuScope is used to do a census of the composition of the read files. Based on a user-defined threshold, organisms identified are used for alignment in the Hexagon alignment." - ], - "authors": [ - { - "orcid": "0000-0003-1409-4549" - }, - { - "orcid": "0000-0002-8138-8312" - } - ], - "description_domain": { - "keywords": [ - "metagenome", - "metagenomic analysis", - "fecal" - ], - "xref": [ - "UBERON:0001988", - "taxID:9606" - ], - "pipeline_steps": [ - { - "tool_name": "HIVE_CensuScope", - "tool_desc": "Detect taxonomic composition of a metagenomic data set.", - "tool_version": "1.3", - "tool_package": "null", - "step_number": "1", - "input_uri_list": [ - "hive://nuc-read/514683", - "hive://nuc-read/514682", - "hive://genome/513957" - ], - "output_uri_list": [ - "hive:///data/524669/dnaAccessionBasedResult.csv" - ] - }, - { - "tool_name": "HIVE_hexagon", - "tool_desc": "Alignment of reads to a set of references", - "tool_version": "1.3", - "tool_package": "null", - "step_number": "2", - "input_uri_list": [ - "https://www.ncbi.nlm.nih.gov/nuccore/CP000139.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929042.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929046.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929045.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929043.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929048.1", - "hive:///data/524669/dnaAccessionBasedResult.csv" - ], - "output_uri_list": [ - "hive://data/524569/alCount-Unalignedo524569-alCount--1.csv" - ] - } - ] - }, - "execution_domain": { - "script": "hive://workflows/human_gut_metagenomic_diversity.py", - "pipeline_version": "2.0", - "platform": "HIVE", - 
"driver": "python_v2.7.13", - "prerequisites": [ - "{\"name\":\"HIVE_censuscope\", \"version\": \"1.3\"},", - "{\"name\":\"HIVE_hexagon\", \"version\": \"1.3\"}" - ] - }, - "parametric_domain": { - "complexityRefEntropy": "1.2 - reluctant", - "maximumPercentLowQualityAllowed": "15", - "keepMarkovnikovMatches": "On", - "storeAlignments": "Yes", - "keepAllMatches": "All equally best alternative matches", - "selfStopping": "No", - "CensuslimitIterations": "5", - "sample": "2500", - "acceptNNNQuaTrheshold": "filter Ns only", - "complexityRefWindow": "30", - "taxDepth": "leaf", - "minMatchLen": "45" - }, - "io_domain": { - "reference_uri_list": [ - "https://www.ncbi.nlm.nih.gov/nuccore/CP000139.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929042.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929046.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929045.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929043.1", - "https://www.ncbi.nlm.nih.gov/nuccore/FP929048.1", - "hive://genome/513957" - ], - "input_uri_list": [ - "https://hive.biochemistry.gwu.edu/nuc-read/514683", - "https://hive.biochemistry.gwu.edu/nuc-read/514682" - ], - "output_uri_list": [ - "https://hive.biochemistry.gwu.edu/data/524669/dnaAccessionBasedResult.csv", - "https://hive.biochemistry.gwu.edu/data/524569/Unaligned Reads (HIVE_ID).fasta", - "https://hive.biochemistry.gwu.edu/data/524569/alCount-Unalignedo524569-alCount--1.csv" - ] - } + "id": "obj.1276", + "name": "Healthy human [taxID:9606] fecal [UBERON:0001988] metagenomic diversity", + "structured_name": "Healthy human [taxID:9606] fecal [UBERON:0001988] metagenomic diversity", + "version": "1.2", + "createdby": "hadley_king@gwmail.gwu.edu", + "created": "Jan 26, 2017 16:35:29", + "modified": "May 11, 2017 11:43:42", + "digital_signature": "QSEQa6HNzERVIMONZWEj", + "verification_status": "unreviewed", + "publication_status": "draft", + "usability_domain": [ + "Identify the most common organism present in a human [taxID:9606] fecal [UBERON:0001988] 
sample, ", + "Identify the general community composition of organisms in a human [taxID:9606] fecal [UBERON:0001988] sample, ", + "CensuScope is used to do a census of the composition of the read files. Based on a user-defined threshold, organisms identified are used for alignment in the Hexagon alignment." + ], + "authors": [ + { + "orcid": "0000-0003-1409-4549" + }, + { + "orcid": "0000-0002-8138-8312" + } + ], + "description_domain": { + "keywords": [ + "metagenome", + "metagenomic analysis", + "fecal" + ], + "xref": [ + "UBERON:0001988", + "taxID:9606" + ], + "pipeline_steps": [ + { + "tool_name": "HIVE_CensuScope", + "tool_desc": "Detect taxonomic composition of a metagenomic data set.", + "tool_version": "1.3", + "tool_package": "null", + "step_number": "1", + "input_uri_list": [ + "hive://nuc-read/514683", + "hive://nuc-read/514682", + "hive://genome/513957" + ], + "output_uri_list": [ + "hive:///data/524669/dnaAccessionBasedResult.csv" + ] + }, + { + "tool_name": "HIVE_hexagon", + "tool_desc": "Alignment of reads to a set of references", + "tool_version": "1.3", + "tool_package": "null", + "step_number": "2", + "input_uri_list": [ + "https://www.ncbi.nlm.nih.gov/nuccore/CP000139.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929042.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929046.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929045.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929043.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929048.1", + "hive:///data/524669/dnaAccessionBasedResult.csv" + ], + "output_uri_list": [ + "hive://data/524569/alCount-Unalignedo524569-alCount--1.csv" + ] + } + ] + }, + "execution_domain": { + "script": "hive://workflows/human_gut_metagenomic_diversity.py", + "pipeline_version": "2.0", + "platform": "HIVE", + "driver": "python_v2.7.13", + "software_prerequisites": [ + { + "name": "HIVE_censuscope", + "version": "1.3" + }, + { + "name": "HIVE_hexagon", + "version": "1.3" + } + ], + "domain_prerequisites": [ + { + "url": 
"protocol://domain:port/application/path", + "name": "generic name" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], + "env_parameters": [ + "HIVEv1.3" + ], + "script_type": "URI" + }, + "parametric_domain": { + "complexityRefEntropy": "1.2 - reluctant", + "maximumPercentLowQualityAllowed": "15", + "keepMarkovnikovMatches": "On", + "storeAlignments": "Yes", + "keepAllMatches": "All equally best alternative matches", + "selfStopping": "No", + "CensuslimitIterations": "5", + "sample": "2500", + "acceptNNNQuaTrheshold": "filter Ns only", + "complexityRefWindow": "30", + "taxDepth": "leaf", + "minMatchLen": "45" + }, + "io_domain": { + "reference_uri_list": [ + "https://www.ncbi.nlm.nih.gov/nuccore/CP000139.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929042.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929046.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929045.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929043.1", + "https://www.ncbi.nlm.nih.gov/nuccore/FP929048.1", + "hive://genome/513957" + ], + "input_uri_list": [ + "https://hive.biochemistry.gwu.edu/nuc-read/514683", + "https://hive.biochemistry.gwu.edu/nuc-read/514682" + ], + "output_uri_list": [ + "https://hive.biochemistry.gwu.edu/data/524669/dnaAccessionBasedResult.csv", + "https://hive.biochemistry.gwu.edu/data/524569/Unaligned Reads (HIVE_ID).fasta", + "https://hive.biochemistry.gwu.edu/data/524569/alCount-Unalignedo524569-alCount--1.csv" + ] + } } \ No newline at end of file diff --git a/3_CAGrepeats/bcoexample_3.json b/3_CAGrepeats/bcoexample_3.json index cba67b3..0cbe535 100644 --- a/3_CAGrepeats/bcoexample_3.json +++ b/3_CAGrepeats/bcoexample_3.json @@ -67,10 +67,26 @@ "pipeline_version": "1.0", "platform": "HIVE", "driver": "hive://hive-driver", - "prerequisites": [ - "name:HIVE-hexagon,version:1.3 ", - "name:HIVE-heptagon,version:1.3" - ], + "software_prerequisites": [ + { + "name": "HIVE_hexagon", + "version": "1.3" + }, + { + "name": "HIVE_heptagon", + "version": "1.3" + } + ], 
+ "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], "env_parameters": [ "HIVEv1.3" ], diff --git a/4_rnaSeqER-PR/bcoexample_4.json b/4_rnaSeqER-PR/bcoexample_4.json index 0b265b8..a6bce4a 100644 --- a/4_rnaSeqER-PR/bcoexample_4.json +++ b/4_rnaSeqER-PR/bcoexample_4.json @@ -104,17 +104,43 @@ "pipeline_version": "0.0", "platform": "HIVE", "driver": "http://128.164.35.92/cwl-tool", - "prerequisites": [ - "TopHat2", - "Cufflinks", - "Cuffmerge", - "Cuffdif", - "cwl-tool" - ], + "software_prerequisites": [ + { + "name": "TopHat2", + "version": "v2.0.13" + }, + { + "name": "Cufflinks", + "version": "v2.0.13" + }, + { + "name": "Cuffmerge", + "version": "v2.2.1" + }, + { + "name": "Cuffdif", + "version": "v2.2.1" + }, + { + "name": "cwl-tool", + "version": "v2.0.13" + } + ], + "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "http://128.164.35.92/", + "name": "access to MGPC" + } + ], "env_parameters": [ "cwlVersion: cwl:draft-3" - ] - }, + ], + "script_type": "CWL" + }, "parametric_domain": { "tophat_threads": "4", "tophat_no-coverage-search": "on", diff --git a/5_viralScreening/bcoexample_5.json b/5_viralScreening/bcoexample_5.json index 861bb14..8d13404 100644 --- a/5_viralScreening/bcoexample_5.json +++ b/5_viralScreening/bcoexample_5.json @@ -96,12 +96,34 @@ "pipeline_version": "2.0", "platform": "HIVE", "driver": "shell", - "prerequisites": [ - "HIVE-seq, version1.3", - "HIVE-IDBA-UD, version1.3", - "HIVE-hexagon, version 1.3", - "HIVE-heptagon, version 1.3" - ], + "software_prerequisites": [ + { + "name": "HIVE-seq", + "version": "1.3" + }, + { + "name": "HIVE-IDBA-UD", + "version": "1.3" + }, + { + "name": "HIVE-hexagon", + "version": "1.3" + }, + { + "name": "HIVE-heptagon", + "version": "1.3" + } + ], + "domain_prerequisites": [ + { + "url": 
"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], "env_parameters": [ "HIVEv1.3" ], diff --git a/6_clonalDisc/bcoexample_6.json b/6_clonalDisc/bcoexample_6.json index 71964e3..e051b45 100644 --- a/6_clonalDisc/bcoexample_6.json +++ b/6_clonalDisc/bcoexample_6.json @@ -1,154 +1,179 @@ { - "id": "obj.1283", - "name": "Quasispecies analysis of HIV-1 [taxID:12721] samples taken from H1V-1 infected individuals [taxID:9606] and detection of novel HIV-1 recombinant subtypes", - "title": "Quasispecies analysis of HIV-1", - "version": "1.0", - "createdby": "naila_gulzar@gwu.edu", - "created": "Feb 02, 2017 14:37:50", - "modified": "Jun 16, 2017 15:04:43", - "digital_signature": "324kjhgiufgioerfhlsjdhbfalskjd", - "verification_status": "unreviewed", - "publication_status": "draft", - "usability_domain": [ - "This pipeline can be used for discovering the new recombinant subtypes of viruses" - ], - "authors": [ - { - "orcid": "0000-0002-8138-8312" - }, - { - "name": "Bhavna Hora" - }, - { - "name": "Konstantinos Karagiannis " - }, - { - "name": "Krista Smith" - }, - { - "name": "Raja Mazumder" - }, - { - "name": "Feng Gao" - } - ], - "description_domain": { - "keywords": [ - "clonal discovery", - "quasispecies", - "clonal contigs", - "HIV-1" - ], - "xref": [ - "UBERON:0001969", - "taxID:9606", - "taxID:12721" - ], - "pipeline_steps": [ - { - "tool_name": "HIVE-hexagon", - "tool_desc": "Aligns reads to a set of reference", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "1", - "input_uri_list": [ - "https://hive/data/nuc-read/645175", - "https://hive/data/nuc-read/645176", - "https://hive/data/nuc-genome/545173" - ], - "output_uri_list": [ - "https://hive/data/hitlist/557006.csv" - ] - }, - { - "tool_name": "HIve-seq", - "tool_desc": "Utility set that allows sequence manipulation", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "2", - 
"input_uri_list": [ - "https://hive/data/hitlist/557006.csv" - ], - "output_uri_list": [ - "https://hive/data/seqtrim/557007.fasta" - ] - }, - { - "tool_name": "HIVE-MAFFT", - "tool_desc": "Multiple sequence alignment tool", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "3", - "input_uri_list": [ - "https://hive/data/seqtrim/557007.fasta" - ], - "output_uri_list": [ - "https://hive/data/MAFFT/557008.fasta" - ] - }, - { - "tool_name": "HIVE-hexagon", - "tool_desc": "Aligns reads to the reference", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "4", - "input_uri_list": [ - "https://hive/data/MAFFT/557008.fasta" - ], - "output_uri_list": [ - "https://hive/data/hexagon/557009/allcounts-aligned.csv" - ] - }, - { - "tool_name": "HIVE-popolation_analysis", - "tool_desc": "clone discovery tool", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "5", - "input_uri_list": [ - "https://hive/data/hexagon/557009/allcounts-aligned.csv" - ], - "output_uri_list": [ - "https://hive/data/557010/popContig.fasta" - ] - } - ] - }, - "execution_domain": { - "script": "hive://workflows/quasispecies_analysis_of_HIV-1_viruses_hive.py", - "pipeline_version": "1.0", - "platform": "HIVE", - "driver": "//hive.biochemistry.gwu.edu/hive-driver", - "prerequisites": [ - "HIVE-hexagon, version1.4.3", - "HIVE-seq, version1.4.3", - "HIVE-MAFFT, version1.4.3", - "HIVE-hexagon, version1.4.3", - "HIVE-population analysis, version1.4.3" - ], - "env_parameters": [ - "HIVEv1.4.3" - ], - "script_type": "URI" - }, - "parametric_domain": { - "hexagon_seed": "0.01" - }, - "io_domain": { - "reference_uri_list": [ - "hive://genomes/333333" - ], - "input_uri_list": [ - "hive://nuc-read/111111", - "hive://nuc-read/222222", - "hive://genomes/333333" - ], - "output_uri_list": [ - "hive://data/888888/popContig ", - "hive://data/888888/popAlignment ", - "hive://data/888888/popCoverage ", - "hive://data/888888/popBreakpoints ", - 
"hive://data/888888/popSummary" - ] - } + "id": "obj.1283", + "name": "Quasispecies analysis of HIV-1 [taxID:12721] samples taken from H1V-1 infected individuals [taxID:9606] and detection of novel HIV-1 recombinant subtypes", + "structured_name": "Quasispecies analysis of HIV-1", + "version": "1.0", + "createdby": "naila_gulzar@gwu.edu", + "created": "Feb 02, 2017 14:37:50", + "modified": "Jun 16, 2017 15:04:43", + "digital_signature": "324kjhgiufgioerfhlsjdhbfalskjd", + "verification_status": "unreviewed", + "publication_status": "draft", + "usability_domain": [ + "This pipeline can be used for discovering the new recombinant subtypes of viruses" + ], + "authors": [ + { + "orcid": "0000-0002-8138-8312" + }, + { + "name": "Bhavna Hora" + }, + { + "name": "Konstantinos Karagiannis " + }, + { + "name": "Krista Smith" + }, + { + "name": "Raja Mazumder" + }, + { + "name": "Feng Gao" + } + ], + "description_domain": { + "keywords": [ + "clonal discovery", + "quasispecies", + "clonal contigs", + "HIV-1" + ], + "xref": [ + "UBERON:0001969", + "taxID:9606", + "taxID:12721" + ], + "pipeline_steps": [ + { + "tool_name": "HIVE-hexagon", + "tool_desc": "Aligns reads to a set of reference", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "1", + "input_uri_list": [ + "https://hive/data/nuc-read/645175", + "https://hive/data/nuc-read/645176", + "https://hive/data/nuc-genome/545173" + ], + "output_uri_list": [ + "https://hive/data/hitlist/557006.csv" + ] + }, + { + "tool_name": "HIve-seq", + "tool_desc": "Utility set that allows sequence manipulation", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "2", + "input_uri_list": [ + "https://hive/data/hitlist/557006.csv" + ], + "output_uri_list": [ + "https://hive/data/seqtrim/557007.fasta" + ] + }, + { + "tool_name": "HIVE-MAFFT", + "tool_desc": "Multiple sequence alignment tool", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "3", + "input_uri_list": [ + 
"https://hive/data/seqtrim/557007.fasta" + ], + "output_uri_list": [ + "https://hive/data/MAFFT/557008.fasta" + ] + }, + { + "tool_name": "HIVE-hexagon", + "tool_desc": "Aligns reads to the reference", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "4", + "input_uri_list": [ + "https://hive/data/MAFFT/557008.fasta" + ], + "output_uri_list": [ + "https://hive/data/hexagon/557009/allcounts-aligned.csv" + ] + }, + { + "tool_name": "HIVE-popolation_analysis", + "tool_desc": "clone discovery tool", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "5", + "input_uri_list": [ + "https://hive/data/hexagon/557009/allcounts-aligned.csv" + ], + "output_uri_list": [ + "https://hive/data/557010/popContig.fasta" + ] + } + ] + }, + "execution_domain": { + "script": "hive://workflows/quasispecies_analysis_of_HIV-1_viruses_hive.py", + "pipeline_version": "1.0", + "platform": "HIVE", + "driver": "//hive.biochemistry.gwu.edu/hive-driver", + "software_prerequisites": [ + { + "name": "HIVE-hexagon", + "version": "1.4.3" + }, + { + "name": "HIVE-seq", + "version": "1.4.3" + }, + { + "name": "HIVE-MAFFT", + "version": "1.4.3" + }, + { + "name": "HIVE-hexagon", + "version": "1.4.3" + }, + { + "name": "HIVE-population analysis", + "version": "1.4.3" + } + ], + "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], + "env_parameters": [ + "HIVEv1.4.3" + ], + "script_type": "URI" + }, + "parametric_domain": { + "hexagon_seed": "0.01" + }, + "io_domain": { + "reference_uri_list": [ + "hive://genomes/333333" + ], + "input_uri_list": [ + "hive://nuc-read/111111", + "hive://nuc-read/222222", + "hive://genomes/333333" + ], + "output_uri_list": [ + "hive://data/888888/popContig ", + "hive://data/888888/popAlignment ", + "hive://data/888888/popCoverage ", + "hive://data/888888/popBreakpoints ", + "hive://data/888888/popSummary" 
+ ] + } } \ No newline at end of file diff --git a/7_EGFR/bcoexample_7.json b/7_EGFR/bcoexample_7.json index 1748bff..5b186b5 100644 --- a/7_EGFR/bcoexample_7.json +++ b/7_EGFR/bcoexample_7.json @@ -1,148 +1,162 @@ { - "id": "obj.1300", - "name": "Detection of EGFR [HGNC:3236] gene mutations in human [taxID:9606] non-small cell lung carcinoma [DOID:3908] patients", - "title": "EGFR mutation detection", - "version": "1.0", - "createdby": "jeetvora@gwmail.gwu.edu", - "created": "May 24, 2017 10:52:06", - "modified": "Jun 16, 2017 13:59:11", - "digital_signature": "QwErTy0ab07BocdNdXyZGwU", - "verification_status": "unreviewed", - "publication_status": "draft", - "usability_domain": [ - "Mutation analysis of epidermal growth factor receptor gene [HGNC:3236] in human [taxID:9606] non-small cell lung carcinoma [DOID:3908] patients", - "Amino acid substitutions [SO:0001606] detection in epidermal growth factor receptor [UniProtKB:P00533] in human [taxID:9606] non-small cell lung carcinoma [DOID:3908] samples ", - "Single-point substitution mutation L858R (c.2573T>G) [VariO:0136], [VariO:0316], [dbSNP:rs121434568] [ClinVar:16609] in exon 21 is the most frequent in NSCLC and is termed as classical mutation. 
The point mutation T790M (c.2369C>T) [VariO:0136], [VariO:0313] [dbSNP:rs121434569] [ClinVar: 16613] accounts for resistances of antineoplastic agents such as gefitinib [PubChem:123631] and erlotinib [PubChem: 176870] resistances in about one-half of the case", - "https://www.mycancergenome.org/content/disease/lung-cancer/egfr" - ], - "authors": [ - { - "orcid": "0000-0002-5317-1458" - }, - { - "name": "Jeet Vora" - } - ], - "description_domain": { - "keywords": [ - "EGFR", - "lung cancer", - "Non-small cell lung carcinoma" - ], - "xref": [ - "HGNC:3236 ", - "UniProtKB:P00533 ", - "taxID:9606 ", - "DOID:3908", - "SO:0001606", - "SO:0001583", - "VariO:0136", - "VariO:0316", - "VariO:0313", - "dbSNP:rs121434568", - "dbSNP:rs121434569 ", - "ClinVar:16609", - "ClinVar:16613", - "PubChem:123631", - "PubChem: 176870", - "PMID:19680293" - ], - "pipeline_steps": [ - { - "tool_name": "HIVE-hexagon", - "tool_desc": "HIVE-hexagon is an alignment algorithm tool that finds short read alignments by seeding, extension and optimal alignment", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "1", - "input_uri_list": [ - "https://www.ncbi.nlm.nih.gov/nuccore/399923581", - "https://hive.biochemistry.gwu.edu/nuc-read/557363", - "https://hive.biochemistry.gwu.edu/nuc-read/557364" - ], - "output_uri_list": [ - "https://hive.biochemistry.gwu/data/557365/allCount-aligned.csv" - ] - }, - { - "tool_name": "HIVE-heptagon", - "tool_desc": "HIVE-heptagon is a profiler that performs variant calling for a previously computed alignment and provides quality and noise assessment profiles", - "tool_version": "1.4.3", - "tool_package": "HIVE", - "step_number": "2", - "input_uri_list": [ - "https://hive.biochemistry.gwu/data/557365/allCount-aligned.csv" - ], - "output_uri_list": [ - "https://hive.biochemistry.gwu/data/557367/SNPprofile.csv" - ] - } - ] - }, - "execution_domain": { - "script": "https://hive/workflows/EGFR_mutation_detection_hive.py", - "pipeline_version": "1.2", - 
"platform": "HIVEv1.4.3", - "driver": "driver: shell", - "prerequisites": [ - "name:HIVE-hexagon", - "version:1.3 ", - "name:HIVE-heptagon", - "version:1.3" - ], - "env_parameters": [ - "{\"OSTYPE\":\"linux\"}", - "{\"QPRIDE_BIN\":\"~qpride/bin\"}" - ], - "access_prerequisites": [ - "url:http://eutils.ncbi.nlm.nih.gov/entrez/eutils/399923581", - "name:access to e-utils" - ], - "script_type": "URI" - }, - "parametric_domain": { - "Hexagon_mismatch_percent_allowed": "15", - "Hexagon_shtrdfilt_window_size": "30", - "Hexagon_matches_to_keep": "Random vote between equally best alternative matches", - "Hexagon_referencemasking_minimal_shannons_entropy": "1-Lenient", - "Hexagon_min_match_length": "45", - "Hexagon_referencemasking_window_size": "30", - "Hexagon_referencemasking_filterNN_lowquality": "Filter NNs only", - "Hexagon_shrdfilt_minimal_shannons_entropy": "1.6-Strict", - "Heptagon_minimal_coverage_allowed": "10" - }, - "io_domain": { - "reference_uri_list": [ - "https://www.ncbi.nlm.nih.gov/nuccore/399923581" - ], - "input_uri_list": [ - "https://hive.biochemistry.gwu.edu/nuc-read/557363", - "https://hive.biochemistry.gwu.edu/nuc-read/557364", - "https://hive.biochemistry.gwu.edu/data/557365/allCount-aligned.csv" - ], - "output_uri_list": [ - "https://hive.biochemistry.gwu.edu/data/557367/SNPProfile.csv", - "https://hive.biochemistry.gwu.edu/data/557365/allCount-aligned.csv" - ], - "output_subdomain": [ - "hit_list: ", - "title: hit list", - "uri:https://hive.biochemistry.gwu.edu/data/557365/allCount-aligned.csv", - "mime-type:csv", - "SNP_profile", - "title: snp profile", - "uri: https://hive.biochemistry.gwu/data/557367/SNPprofile.csv", - "mime-type: csv" - ], - "input_subdomain": [ - "Read Files for alignment", - "https://hive.biochemistry.gwu.edu/nuc-read/557363", - "https://hive.biochemistry.gwu.edu/nuc-read/557364", - "EGFR Gene", - "https://www.ncbi.nlm.nih.gov/nuccore/399923581" - ] - }, - "error_domain": [ - "false negative alignment hits < 0.0010", - 
"false positive mutation calls discovery < 0.0005" - ] + "id": "obj.1300", + "name": "Detection of EGFR [HGNC:3236] gene mutations in human [taxID:9606] non-small cell lung carcinoma [DOID:3908] patients", + "title": "EGFR mutation detection", + "version": "1.0", + "createdby": "jeetvora@gwmail.gwu.edu", + "created": "May 24, 2017 10:52:06", + "modified": "Jun 16, 2017 13:59:11", + "digital_signature": "QwErTy0ab07BocdNdXyZGwU", + "verification_status": "unreviewed", + "publication_status": "draft", + "usability_domain": [ + "Mutation analysis of epidermal growth factor receptor gene [HGNC:3236] in human [taxID:9606] non-small cell lung carcinoma [DOID:3908] patients", + "Amino acid substitutions [SO:0001606] detection in epidermal growth factor receptor [UniProtKB:P00533] in human [taxID:9606] non-small cell lung carcinoma [DOID:3908] samples ", + "Single-point substitution mutation L858R (c.2573T>G) [VariO:0136], [VariO:0316], [dbSNP:rs121434568] [ClinVar:16609] in exon 21 is the most frequent in NSCLC and is termed as classical mutation. 
The point mutation T790M (c.2369C>T) [VariO:0136], [VariO:0313] [dbSNP:rs121434569] [ClinVar: 16613] accounts for resistances of antineoplastic agents such as gefitinib [PubChem:123631] and erlotinib [PubChem: 176870] resistances in about one-half of the case", + "https://www.mycancergenome.org/content/disease/lung-cancer/egfr" + ], + "authors": [ + { + "orcid": "0000-0002-5317-1458" + }, + { + "name": "Jeet Vora" + } + ], + "description_domain": { + "keywords": [ + "EGFR", + "lung cancer", + "Non-small cell lung carcinoma" + ], + "xref": [ + "HGNC:3236 ", + "UniProtKB:P00533 ", + "taxID:9606 ", + "DOID:3908", + "SO:0001606", + "SO:0001583", + "VariO:0136", + "VariO:0316", + "VariO:0313", + "dbSNP:rs121434568", + "dbSNP:rs121434569 ", + "ClinVar:16609", + "ClinVar:16613", + "PubChem:123631", + "PubChem: 176870", + "PMID:19680293" + ], + "pipeline_steps": [ + { + "tool_name": "HIVE-hexagon", + "tool_desc": "HIVE-hexagon is an alignment algorithm tool that finds short read alignments by seeding, extension and optimal alignment", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "1", + "input_uri_list": [ + "https://www.ncbi.nlm.nih.gov/nuccore/399923581", + "https://hive.biochemistry.gwu.edu/nuc-read/557363", + "https://hive.biochemistry.gwu.edu/nuc-read/557364" + ], + "output_uri_list": [ + "https://hive.biochemistry.gwu/data/557365/allCount-aligned.csv" + ] + }, + { + "tool_name": "HIVE-heptagon", + "tool_desc": "HIVE-heptagon is a profiler that performs variant calling for a previously computed alignment and provides quality and noise assessment profiles", + "tool_version": "1.4.3", + "tool_package": "HIVE", + "step_number": "2", + "input_uri_list": [ + "https://hive.biochemistry.gwu/data/557365/allCount-aligned.csv" + ], + "output_uri_list": [ + "https://hive.biochemistry.gwu/data/557367/SNPprofile.csv" + ] + } + ] + }, + "execution_domain": { + "script": "https://hive/workflows/EGFR_mutation_detection_hive.py", + "pipeline_version": "1.2", + 
"platform": "HIVEv1.4.3", + "driver": "driver: shell", + "software_prerequisites": [ + { + "name": "HIVE_hexagon", + "version": "1.3" + }, + { + "name": "HIVE_heptagon", + "version": "1.3" + } + ], + "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], + "env_parameters": [ + "{\"OSTYPE\":\"linux\"}", + "{\"QPRIDE_BIN\":\"~qpride/bin\"}" + ], + "access_prerequisites": [ + "url:http://eutils.ncbi.nlm.nih.gov/entrez/eutils/399923581", + "name:access to e-utils" + ], + "script_type": "URI" + }, + "parametric_domain": { + "Hexagon_mismatch_percent_allowed": "15", + "Hexagon_shtrdfilt_window_size": "30", + "Hexagon_matches_to_keep": "Random vote between equally best alternative matches", + "Hexagon_referencemasking_minimal_shannons_entropy": "1-Lenient", + "Hexagon_min_match_length": "45", + "Hexagon_referencemasking_window_size": "30", + "Hexagon_referencemasking_filterNN_lowquality": "Filter NNs only", + "Hexagon_shrdfilt_minimal_shannons_entropy": "1.6-Strict", + "Heptagon_minimal_coverage_allowed": "10" + }, + "io_domain": { + "reference_uri_list": [ + "https://www.ncbi.nlm.nih.gov/nuccore/399923581" + ], + "input_uri_list": [ + "https://hive.biochemistry.gwu.edu/nuc-read/557363", + "https://hive.biochemistry.gwu.edu/nuc-read/557364", + "https://hive.biochemistry.gwu.edu/data/557365/allCount-aligned.csv" + ], + "output_uri_list": [ + "https://hive.biochemistry.gwu.edu/data/557367/SNPProfile.csv", + "https://hive.biochemistry.gwu.edu/data/557365/allCount-aligned.csv" + ], + "output_subdomain": [ + "hit_list: ", + "title: hit list", + "uri:https://hive.biochemistry.gwu.edu/data/557365/allCount-aligned.csv", + "mime-type:csv", + "SNP_profile", + "title: snp profile", + "uri: https://hive.biochemistry.gwu/data/557367/SNPprofile.csv", + "mime-type: csv" + ], + "input_subdomain": [ + "Read Files for alignment", + 
"https://hive.biochemistry.gwu.edu/nuc-read/557363", + "https://hive.biochemistry.gwu.edu/nuc-read/557364", + "EGFR Gene", + "https://www.ncbi.nlm.nih.gov/nuccore/399923581" + ] + }, + "error_domain": [ + "false negative alignment hits < 0.0010", + "false positive mutation calls discovery < 0.0005" + ] } \ No newline at end of file diff --git a/8_gatk_Hemophilia/.DS_Store b/8_gatk_Hemophilia/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/8_gatk_Hemophilia/.DS_Store differ diff --git a/8_gatk_Hemophilia/bcoexample_7.json b/8_gatk_Hemophilia/bcoexample_7.json deleted file mode 100644 index 5caec86..0000000 --- a/8_gatk_Hemophilia/bcoexample_7.json +++ /dev/null @@ -1,360 +0,0 @@ -{ - "id": "obj.1298", - "name": "Identification of recombinant antihemophilic factor VII [UniProt:P00451] inhibitor SNPs [SO:0000694] in human [taxID:9606] blood [UBERON:0000178] extracted from patients with hemophilia A [DOID:12134]", - "title": "Identification of recombinant antihemophilic factor", - "version": "1.0", - "createdby": "hadley_king@gwmail.gwu.edu", - "created": "Mar 12, 2017 16:50:32", - "modified": "Jun 15, 2017 11:43:35", - "digital_signature": "as243hgdfbvsh345354jnjhjfdf", - "verification_status": "unreviewed", - "publication_status": "draft", - "usability_domain": [ - "Identify SNPs [SO:0000694] that correlate with the development of an inhibitory response to recombinant antihemophilic factor VII [UniProt:P00451] in patients with hemophilia A [DOID:12134]" - ], - "authors": [ - { - "orcid": "0000-0003-1409-4549" - }, - { - "name": "Joe Mcgill" - } - ], - "description_domain": { - "keywords": [ - "hemophilia A", - "recombinant antihemophilic factor VII", - "SNPs" - ], - "xref": [ - "taxID:9606 ", - "UBERON:0000178 ", - "DB:00025 ", - "SO:0000694", - "UniProt:P00451" - ], - "pipeline_steps": [ - { - "tool_name": "samtools_faidx", - "tool_desc": "Index reference sequence in the FASTA format", - "tool_version": "0.1.19-96b5f2294a", - 
"tool_package": "", - "step_number": "1", - "input_uri_list": [ - "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" - ], - "output_uri_list": [ - "./biocompute/human_g1k_v37.fasta" - ] - }, - { - "tool_name": "picard_CreateSequenceDictionary", - "tool_desc": "Creates a sequence dictionary for a reference sequence", - "tool_version": "v2.6.0", - "tool_package": "", - "step_number": "2", - "input_uri_list": [ - "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" - ], - "output_uri_list": [ - "home/jmcgill/Desktop/biocompute/human_g1k_v37.dict" - ] - }, - { - "tool_name": "samtools_sort", - "tool_desc": "Sort alignments by leftmost coordinates", - "tool_version": "", - "tool_package": "", - "step_number": "3", - "input_uri_list": [ - "./bam_files/biocompute/B_S30.bam", - "./bam_files/biocompute/C_S31.bam", - "./bam_files/biocompute/D_S32.bam", - "./bam_files/biocompute/Hai003_S3.bam", - "./bam_files/biocompute/Hai004_S4.bam", - "./bam_files/biocompute/Hai006_S6.bam", - "./bam_files/biocompute/Hai007_S7.bam", - "./bam_files/biocompute/Hai012_S41.bam", - "./bam_files/biocompute/Hawi010_S16.bam", - "./bam_files/biocompute/Hawi015_S21.bam", - "./bam_files/biocompute/Hawi028_S35.bam", - "./bam_files/biocompute/Hawi032_S39.bam" - ], - "output_uri_list": [ - "/home/biocompute/B_S30.bam_sorted", - "/home/biocompute/C_S31.bam_sorted", - "/home/biocompute/D_S32.bam_sorted", - "/home/biocompute/Hai003_S3.bam_sorted", - "/home/biocompute/Hai004_S4.bam_sorted", - "/home/biocompute/Hai006_S6.bam_sorted", - "/home/biocompute/Hai007_S7.bam_sorted", - "/home/biocompute/Hai012_S41.bam_sorted", - "/home/biocompute/Hawi010_S16.bam_sorted", - "/home/biocompute/Hawi015_S21.bam_sorted", - "/home/biocompute/Hawi028_S35.bam_sorted", - "/home/biocompute/Hawi032_S39.bam_so" - ] - }, - { - "tool_name": "picard_MarkDuplicates", - "tool_desc": "This tool locates and tags duplicate reads in a BAM or SAM file", - 
"tool_version": "0.1.19-96b5f2294a", - "tool_package": "", - "step_number": "4", - "input_uri_list": [ - "B_sorted.bam", - "C_sorted.bam", - "D_sorted.bam", - "Hai003_sorted.bam", - "Hai004_sorted.bam", - "Hai006_sorted.bam", - "Hai007_sorted.bam", - "Hai012_sorted.bam", - "Hawi010_sorted.bam", - "Hawi015_sorted.bam", - "Hawi028_sorted.bam", - "Hawi032_sorted.bam" - ], - "output_uri_list": [ - "new_B_mark_duplicates.bam", - "new_C_mark_duplicates.bam", - "new_D_mark_duplicates.bam", - "new_Hai003_mark_duplicates.bam", - "new_Hai004_mark_duplicates.bam", - "new_Hai006_mark_duplicates.bam", - "new_Hai007_mark_duplicates.bam", - "new_Hai012_mark_duplicates.bam", - "new_Hawi010_mark_duplicates.bam", - "new_Hawi015_mark_duplicates.bam", - "new_Hawi028_mark_duplicates.bam", - "new_Hawi032_mark_duplicates.bam", - "B.txt", - "C.txt", - "D.txt", - "Hai003.txt", - "Hai004.txt", - "Hai006.txt", - "Hai007.txt", - "Hai012.txt", - "Hawi010.txt", - "Hawi015.txt", - "Hawi028.txt", - "Hawi032.txt" - ] - }, - { - "tool_name": "picard_AddOrReplaceReadGroups", - "tool_desc": "Replace read groups in a BAM file", - "tool_version": "0.1.19-96b5f2294a", - "tool_package": "", - "step_number": "5", - "input_uri_list": [ - "new_B_mark_duplicates.bam", - "new_C_mark_duplicates.bam", - "new_D_mark_duplicates.bam", - "new_Hai003_mark_duplicates.bam", - "new_Hai004_mark_duplicates.bam", - "new_Hai006_mark_duplicates.bam", - "new_Hai007_mark_duplicates.bam", - "new_Hai012_mark_duplicates.bam", - "new_Hawi010_mark_duplicates.bam", - "new_Hawi015_mark_duplicates.bam", - "new_Hawi028_mark_duplicates.bam", - "new_Hawi032_mark_duplicates.bam" - ], - "output_uri_list": [ - "with_header_new_B_mark_duplicates.bam", - "with_header_new_C_mark_duplicates.bam", - "with_header_new_D_mark_duplicates.bam", - "with_header_new_Hai003_mark_duplicates.bam", - "with_header_new_Hai004_mark_duplicates.bam", - "with_header_new_Hai006_mark_duplicates.bam", - "with_header_new_Hai007_mark_duplicates.bam", - 
"with_header_new_Hai012_mark_duplicates.bam", - "with_header_new_Hawi010_mark_duplicates.bam", - "with_header_new_Hawi015_mark_duplicates.bam", - "with_header_new_Hawi028_mark_duplicates.bam", - "with_header_new_Hawi032_mark_duplicates.bam" - ] - }, - { - "tool_name": "GenomeAnalysisTK_RealignerTargetCreator", - "tool_desc": "Determining (small) suspicious intervals which are likely in need of realignment", - "tool_version": "3.7-0-gcfedb67", - "tool_package": "", - "step_number": "6", - "input_uri_list": [ - "./biocompute/human_g1k_v37.fasta", - "with_header_new_B_mark_duplicates.bam", - "with_header_new_C_mark_duplicates.bam", - "with_header_new_D_mark_duplicates.bam", - "with_header_new_Hai003_mark_duplicates.bam", - "with_header_new_Hai004_mark_duplicates.bam", - "with_header_new_Hai006_mark_duplicates.bam", - "with_header_new_Hai007_mark_duplicates.bam", - "with_header_new_Hai012_mark_duplicates.bam", - "with_header_new_Hawi010_mark_duplicates.bam", - "with_header_new_Hawi015_mark_duplicates.bam", - "with_header_new_Hawi028_mark_duplicates.bam", - "with_header_new_Hawi032_mark_duplicates.bam" - ], - "output_uri_list": [ - "realignedB.list", - "realignedC.list", - "realignedD.list", - "realignedHai003.list", - "realignedHai004.list", - "realignedHai006.list", - "realignedHai007.list", - "realignedHai012.list", - "realignedHawi010.list", - "realignedHawi015.list", - "realignedHawi028.list", - "realignedHawi032.list" - ] - }, - { - "tool_name": "GenomeAnalysisTK_IndelRealigner", - "tool_desc": "Perform local realignment of reads around indels", - "tool_version": "3.7-0-gcfedb67", - "tool_package": "", - "step_number": "7", - "input_uri_list": [ - "./biocompute/human_g1k_v37.fasta", - "with_header_new_B_mark_duplicates.bam-targetIntervals", - "with_header_new_C_mark_duplicates.bam-targetIntervals", - "with_header_new_D_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai003_mark_duplicates.bam-targetIntervals", - 
"with_header_new_Hai004_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai006_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai007_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai012_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi010_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi015_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi028_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi032_mark_duplicates.bam-targetIntervals", - "realignedB.list", - "realignedC.list", - "realignedD.list", - "realignedHai003.list", - "realignedHai004.list", - "realignedHai006.list", - "realignedHai007.list", - "realignedHai012.list", - "realignedHawi010.list", - "realignedHawi015.list", - "realignedHawi028.list", - "realignedHawi032.list" - ], - "output_uri_list": [ - "B_realigned_reads.bam", - "C_realigned_reads.bam", - "D_realigned_reads.bam", - "Hai003_realigned_reads.bam", - "Hai004_realigned_reads.bam", - "Hai006_realigned_reads.bam", - "Hai007_realigned_reads.bam", - "Hai012_realigned_reads.bam", - "Hawi010_realigned_reads.bam", - "Hawi015_realigned_reads.bam", - "Hawi028_realigned_reads.bam", - "Hawi032_realigned_reads.bam" - ] - }, - { - "tool_name": "GenomeAnalysisTK_HaplotypeCaller", - "tool_desc": "Perform local realignment of reads around indels", - "tool_version": "3.7-0-gcfedb67", - "tool_package": "", - "step_number": "8", - "input_uri_list": [ - "B_realigned_reads.bam", - "C_realigned_reads.bam", - "D_realigned_reads.bam", - "Hai003_realigned_reads.bam", - "Hai004_realigned_reads.bam", - "Hai006_realigned_reads.bam", - "Hai007_realigned_reads.bam", - "Hai012_realigned_reads.bam", - "Hawi010_realigned_reads.bam", - "Hawi015_realigned_reads.bam", - "Hawi028_realigned_reads.bam", - "Hawi032_realigned_reads.bam" - ], - "output_uri_list": [ - "B_gatk_raw_snps.vcf", - "C_gatk_raw_snps.vcf", - "D_gatk_raw_snps.vcf", - "Hai003_gatk_raw_snps.vcf", - "Hai004_gatk_raw_snps.vcf", - 
"Hai006_gatk_raw_snps.vcf", - "Hai007_gatk_raw_snps.vcf", - "Hai012_gatk_raw_snps.vcf", - "Hawi010_gatk_raw_snps.vcf", - "Hawi015_gatk_raw_snps.vcf", - "Hawi028_gatk_raw_snps.vcf", - "Hawi032_gatk_raw_snps.vcf" - ] - } - ] - }, - "execution_domain": { - "script": "https://github.com/biocompute-objects//HTS-CSRS/tree/master/8_gatk_Hemophilia/make_vcfs.py", - "pipeline_version": "1.0", - "platform": "linux", - "driver": "Python 2.7.10", - "prerequisites": [ - "name:GenomAnalysisToolKit", - "version:3.6", - "name:Picard", - "version:v2.6.0", - "name:samtools", - "version:v0.1.19-96b5f2294a", - "name:Python", - "version:2.7.10" - ], - "env_parameters": [ - "10 GB memory required" - ], - "script_type": "URI" - }, - "parametric_domain": { - "picard_MarkDuplicates_REMOVE_DUPLICATES": "no", - "picard_AddOrReplaceReadGroups_ReadGroupPlatform": "illumina", - "GenomeAnalysisTK_HaplotypeCaller_Intervals": "X", - "GenomeAnalysisTK_RealignerTargetCreator_Num_threads": "22" - }, - "io_domain": { - "reference_uri_list": [ - "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" - ], - "input_uri_list": [ - "B_gatk_raw_snps.vcf", - "C_gatk_raw_snps.vcf", - "D_gatk_raw_snps.vcf", - "Hai003_gatk_raw_snps.vcf", - "Hai004_gatk_raw_snps.vcf", - "Hai006_gatk_raw_snps.vcf", - "Hai007_gatk_raw_snps.vcf", - "Hai012_gatk_raw_snps.vcf", - "Hawi010_gatk_raw_snps.vcf", - "Hawi015_gatk_raw_snps.vcf", - "Hawi028_gatk_raw_snps.vcf", - "Hawi032_gatk_raw_snps.vcf" - ], - "output_uri_list": [ - "./B_S30.bam", - "./C_S31.bam", - "./D_S32.bam", - "./Hai003_S3.bam", - "./Hai004_S4.bam", - "./Hai006_S6.bam", - "./Hai007_S7.bam", - "./Hai012_S41.bam", - "./Hawi010_S16.bam", - "./Hawi015_S21.bam", - "./Hawi028_S35.bam", - "./Hawi032_S39.bam" - ] - } -} \ No newline at end of file diff --git a/8_gatk_Hemophilia/bcoexample_8.json b/8_gatk_Hemophilia/bcoexample_8.json index d277fb6..3b0d2e5 100644 --- a/8_gatk_Hemophilia/bcoexample_8.json +++ 
b/8_gatk_Hemophilia/bcoexample_8.json @@ -1,353 +1,374 @@ { - "id": "obj.1298", - "name": "Identification of recombinant antihemophilic factor [Uniprot:P00451] inhibitor SNPs [SO:0000694] in human [taxID:9606] blood [UBERON:0000178} extracted from patients with Hemophilia A [DOID:12134]", - "version": "1.0", - "createdby": "hadley_king@gwmail.gwu.edu", - "created": "Mar 12, 2017 16:50:32", - "modified": "Mar 12, 2017 19:01:02", - "verification_status": "unreviewed", - "publication_status": "draft", - "usability_domain": [ - "Identify SNPs [SO:0000694] that corrilate with the development of an inhibitory response to recombinant antihemophilic factor VII in patiens with hemophilia A [DOID:12134]" - ], - "authors": [ - { - "orcid": "0000-0003-1409-4549" - }, - { - "name": "Joe Mcgill" - } - ], - "description_domain": { - "keywords": [ - "hemophilia A", - "recombinant antihemophilic factor VII", - "SNPs" - ], - "xref": [ - "taxID:9606 ", - "Uniprot:P00451 ", - "UBERON:0000178 ", - "DB:00025 ", - "SO:0000694" - ], - "pipeline_steps": [ - { - "tool_name": "samtools_faidx", - "tool_desc": "Index reference sequence in the FASTA format", - "tool_version": "0.1.19-96b5f2294a", - "tool_package": "", - "step_number": "1", - "input_uri_list": [ - "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" - ], - "output_uri_list": [ - "./biocompute/human_g1k_v37.fasta" - ] - }, - { - "tool_name": "picard_CreateSequenceDictionary", - "tool_desc": "Creates a sequence dictionary for a reference sequence", - "tool_version": "v2.6.0", - "tool_package": "", - "step_number": "2", - "input_uri_list": [ - "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" - ], - "output_uri_list": [ - "home/jmcgill/Desktop/biocompute/human_g1k_v37.dict" - ] - }, - { - "tool_name": "samtools_sort", - "tool_desc": "Sort alignments by leftmost coordinates", - "tool_version": "", - "tool_package": "", - "step_number": "3", - "input_uri_list": 
[ - "./bam_files/biocompute/B_S30.bam", - "./bam_files/biocompute/C_S31.bam", - "./bam_files/biocompute/D_S32.bam", - "./bam_files/biocompute/Hai003_S3.bam", - "./bam_files/biocompute/Hai004_S4.bam", - "./bam_files/biocompute/Hai006_S6.bam", - "./bam_files/biocompute/Hai007_S7.bam", - "./bam_files/biocompute/Hai012_S41.bam", - "./bam_files/biocompute/Hawi010_S16.bam", - "./bam_files/biocompute/Hawi015_S21.bam", - "./bam_files/biocompute/Hawi028_S35.bam", - "./bam_files/biocompute/Hawi032_S39.bam" - ], - "output_uri_list": [ - "/home/biocompute/B_S30.bam_sorted", - "/home/biocompute/C_S31.bam_sorted", - "/home/biocompute/D_S32.bam_sorted", - "/home/biocompute/Hai003_S3.bam_sorted", - "/home/biocompute/Hai004_S4.bam_sorted", - "/home/biocompute/Hai006_S6.bam_sorted", - "/home/biocompute/Hai007_S7.bam_sorted", - "/home/biocompute/Hai012_S41.bam_sorted", - "/home/biocompute/Hawi010_S16.bam_sorted", - "/home/biocompute/Hawi015_S21.bam_sorted", - "/home/biocompute/Hawi028_S35.bam_sorted", - "/home/biocompute/Hawi032_S39.bam_so" - ] - }, - { - "tool_name": "picard_MarkDuplicates", - "tool_desc": "This tool locates and tags duplicate reads in a BAM or SAM file", - "tool_version": "0.1.19-96b5f2294a", - "tool_package": "", - "step_number": "4", - "input_uri_list": [ - "B_sorted.bam", - "C_sorted.bam", - "D_sorted.bam", - "Hai003_sorted.bam", - "Hai004_sorted.bam", - "Hai006_sorted.bam", - "Hai007_sorted.bam", - "Hai012_sorted.bam", - "Hawi010_sorted.bam", - "Hawi015_sorted.bam", - "Hawi028_sorted.bam", - "Hawi032_sorted.bam" - ], - "output_uri_list": [ - "new_B_mark_duplicates.bam", - "new_C_mark_duplicates.bam", - "new_D_mark_duplicates.bam", - "new_Hai003_mark_duplicates.bam", - "new_Hai004_mark_duplicates.bam", - "new_Hai006_mark_duplicates.bam", - "new_Hai007_mark_duplicates.bam", - "new_Hai012_mark_duplicates.bam", - "new_Hawi010_mark_duplicates.bam", - "new_Hawi015_mark_duplicates.bam", - "new_Hawi028_mark_duplicates.bam", - "new_Hawi032_mark_duplicates.bam", - 
"B.txt", - "C.txt", - "D.txt", - "Hai003.txt", - "Hai004.txt", - "Hai006.txt", - "Hai007.txt", - "Hai012.txt", - "Hawi010.txt", - "Hawi015.txt", - "Hawi028.txt", - "Hawi032.txt" - ] - }, - { - "tool_name": "picard_AddOrReplaceReadGroups", - "tool_desc": "Replace read groups in a BAM file", - "tool_version": "0.1.19-96b5f2294a", - "tool_package": "", - "step_number": "5", - "input_uri_list": [ - "new_B_mark_duplicates.bam", - "new_C_mark_duplicates.bam", - "new_D_mark_duplicates.bam", - "new_Hai003_mark_duplicates.bam", - "new_Hai004_mark_duplicates.bam", - "new_Hai006_mark_duplicates.bam", - "new_Hai007_mark_duplicates.bam", - "new_Hai012_mark_duplicates.bam", - "new_Hawi010_mark_duplicates.bam", - "new_Hawi015_mark_duplicates.bam", - "new_Hawi028_mark_duplicates.bam", - "new_Hawi032_mark_duplicates.bam" - ], - "output_uri_list": [ - "with_header_new_B_mark_duplicates.bam", - "with_header_new_C_mark_duplicates.bam", - "with_header_new_D_mark_duplicates.bam", - "with_header_new_Hai003_mark_duplicates.bam", - "with_header_new_Hai004_mark_duplicates.bam", - "with_header_new_Hai006_mark_duplicates.bam", - "with_header_new_Hai007_mark_duplicates.bam", - "with_header_new_Hai012_mark_duplicates.bam", - "with_header_new_Hawi010_mark_duplicates.bam", - "with_header_new_Hawi015_mark_duplicates.bam", - "with_header_new_Hawi028_mark_duplicates.bam", - "with_header_new_Hawi032_mark_duplicates.bam" - ] - }, - { - "tool_name": "GenomeAnalysisTK_RealignerTargetCreator", - "tool_desc": "Determining (small) suspicious intervals which are likely in need of realignment", - "tool_version": "3.7-0-gcfedb67", - "tool_package": "", - "step_number": "6", - "input_uri_list": [ - "./biocompute/human_g1k_v37.fasta", - "with_header_new_B_mark_duplicates.bam", - "with_header_new_C_mark_duplicates.bam", - "with_header_new_D_mark_duplicates.bam", - "with_header_new_Hai003_mark_duplicates.bam", - "with_header_new_Hai004_mark_duplicates.bam", - "with_header_new_Hai006_mark_duplicates.bam", - 
"with_header_new_Hai007_mark_duplicates.bam", - "with_header_new_Hai012_mark_duplicates.bam", - "with_header_new_Hawi010_mark_duplicates.bam", - "with_header_new_Hawi015_mark_duplicates.bam", - "with_header_new_Hawi028_mark_duplicates.bam", - "with_header_new_Hawi032_mark_duplicates.bam" - ], - "output_uri_list": [ - "realignedB.list", - "realignedC.list", - "realignedD.list", - "realignedHai003.list", - "realignedHai004.list", - "realignedHai006.list", - "realignedHai007.list", - "realignedHai012.list", - "realignedHawi010.list", - "realignedHawi015.list", - "realignedHawi028.list", - "realignedHawi032.list" - ] - }, - { - "tool_name": "GenomeAnalysisTK_IndelRealigner", - "tool_desc": "Perform local realignment of reads around indels", - "tool_version": "3.7-0-gcfedb67", - "tool_package": "", - "step_number": "7", - "input_uri_list": [ - "./biocompute/human_g1k_v37.fasta", - "with_header_new_B_mark_duplicates.bam-targetIntervals", - "with_header_new_C_mark_duplicates.bam-targetIntervals", - "with_header_new_D_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai003_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai004_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai006_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai007_mark_duplicates.bam-targetIntervals", - "with_header_new_Hai012_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi010_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi015_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi028_mark_duplicates.bam-targetIntervals", - "with_header_new_Hawi032_mark_duplicates.bam-targetIntervals", - "realignedB.list", - "realignedC.list", - "realignedD.list", - "realignedHai003.list", - "realignedHai004.list", - "realignedHai006.list", - "realignedHai007.list", - "realignedHai012.list", - "realignedHawi010.list", - "realignedHawi015.list", - "realignedHawi028.list", - "realignedHawi032.list" - ], - "output_uri_list": [ - "B_realigned_reads.bam", 
- "C_realigned_reads.bam", - "D_realigned_reads.bam", - "Hai003_realigned_reads.bam", - "Hai004_realigned_reads.bam", - "Hai006_realigned_reads.bam", - "Hai007_realigned_reads.bam", - "Hai012_realigned_reads.bam", - "Hawi010_realigned_reads.bam", - "Hawi015_realigned_reads.bam", - "Hawi028_realigned_reads.bam", - "Hawi032_realigned_reads.bam" - ] - }, - { - "tool_name": "GenomeAnalysisTK_HaplotypeCaller", - "tool_desc": "Perform local realignment of reads around indels", - "tool_version": "3.7-0-gcfedb67", - "tool_package": "", - "step_number": "8", - "input_uri_list": [ - "B_realigned_reads.bam", - "C_realigned_reads.bam", - "D_realigned_reads.bam", - "Hai003_realigned_reads.bam", - "Hai004_realigned_reads.bam", - "Hai006_realigned_reads.bam", - "Hai007_realigned_reads.bam", - "Hai012_realigned_reads.bam", - "Hawi010_realigned_reads.bam", - "Hawi015_realigned_reads.bam", - "Hawi028_realigned_reads.bam", - "Hawi032_realigned_reads.bam" - ], - "output_uri_list": [ - "B_gatk_raw_snps.vcf", - "C_gatk_raw_snps.vcf", - "D_gatk_raw_snps.vcf", - "Hai003_gatk_raw_snps.vcf", - "Hai004_gatk_raw_snps.vcf", - "Hai006_gatk_raw_snps.vcf", - "Hai007_gatk_raw_snps.vcf", - "Hai012_gatk_raw_snps.vcf", - "Hawi010_gatk_raw_snps.vcf", - "Hawi015_gatk_raw_snps.vcf", - "Hawi028_gatk_raw_snps.vcf", - "Hawi032_gatk_raw_snps.vcf" - ] - } - ] - }, - "execution_domain": { - "script": "https://github.com/biocompute-objects//HTS-CSRS/tree/master/8_gatk_Hemophilia/make_vcfs.py", - "pipeline_version": "1.0", - "platform": "linux", - "driver": "Python 2.7.10", - "prerequisites": [ - "{\"name\":\"GenomAnalysisToolKit\",\"version\":\"3.6\"}", - "{\"name\":\"Picard\",\"version\":\"v2.6.0\"}", - "{\"name\":\"samtools\",\"version\":\"v 0.1.19-96b5f2294a\"}", - "{\"name\":\"Python\",\"version\":\"2.7.10\"}" - ], - "env_parameters": [ - "10 GB memory required" - ] - }, - "parametric_domain": { - "picard_MarkDuplicates_REMOVE_DUPLICATES": "no", - "picard_AddOrReplaceReadGroups_ReadGroupPlatform": 
"illumina", - "GenomeAnalysisTK_HaplotypeCaller_Intervals": "X", - "GenomeAnalysisTK_RealignerTargetCreator_Num_threads": "22" - }, - "io_domain": { - "reference_uri_list": [ - "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" - ], - "input_uri_list": [ - "B_gatk_raw_snps.vcf", - "C_gatk_raw_snps.vcf", - "D_gatk_raw_snps.vcf", - "Hai003_gatk_raw_snps.vcf", - "Hai004_gatk_raw_snps.vcf", - "Hai006_gatk_raw_snps.vcf", - "Hai007_gatk_raw_snps.vcf", - "Hai012_gatk_raw_snps.vcf", - "Hawi010_gatk_raw_snps.vcf", - "Hawi015_gatk_raw_snps.vcf", - "Hawi028_gatk_raw_snps.vcf", - "Hawi032_gatk_raw_snps.vcf" - ], - "output_uri_list": [ - "./B_S30.bam", - "./C_S31.bam", - "./D_S32.bam", - "./Hai003_S3.bam", - "./Hai004_S4.bam", - "./Hai006_S6.bam", - "./Hai007_S7.bam", - "./Hai012_S41.bam", - "./Hawi010_S16.bam", - "./Hawi015_S21.bam", - "./Hawi028_S35.bam", - "./Hawi032_S39.bam" - ] - } + "id": "obj.1298", + "name": "Identification of recombinant antihemophilic factor VII [UniProt:P00451] inhibitor SNPs [SO:0000694] in human [taxID:9606] blood [UBERON:0000178] extracted from patients with hemophilia A [DOID:12134]", + "title": "Identification of recombinant antihemophilic factor", + "version": "1.0", + "createdby": "hadley_king@gwmail.gwu.edu", + "created": "Mar 12, 2017 16:50:32", + "modified": "Jun 15, 2017 11:43:35", + "digital_signature": "as243hgdfbvsh345354jnjhjfdf", + "verification_status": "unreviewed", + "publication_status": "draft", + "usability_domain": [ + "Identify SNPs [SO:0000694] that correlate with the development of an inhibitory response to recombinant antihemophilic factor VII [UniProt:P00451] in patients with hemophilia A [DOID:12134]" + ], + "authors": [ + { + "orcid": "0000-0003-1409-4549" + }, + { + "name": "Joe Mcgill" + } + ], + "description_domain": { + "keywords": [ + "hemophilia A", + "recombinant antihemophilic factor VII", + "SNPs" + ], + "xref": [ + "taxID:9606 ", + "UBERON:0000178 ", + "DB:00025 ", + 
"SO:0000694", + "UniProt:P00451" + ], + "pipeline_steps": [ + { + "tool_name": "samtools_faidx", + "tool_desc": "Index reference sequence in the FASTA format", + "tool_version": "0.1.19-96b5f2294a", + "tool_package": "", + "step_number": "1", + "input_uri_list": [ + "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" + ], + "output_uri_list": [ + "./biocompute/human_g1k_v37.fasta" + ] + }, + { + "tool_name": "picard_CreateSequenceDictionary", + "tool_desc": "Creates a sequence dictionary for a reference sequence", + "tool_version": "v2.6.0", + "tool_package": "", + "step_number": "2", + "input_uri_list": [ + "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" + ], + "output_uri_list": [ + "home/jmcgill/Desktop/biocompute/human_g1k_v37.dict" + ] + }, + { + "tool_name": "samtools_sort", + "tool_desc": "Sort alignments by leftmost coordinates", + "tool_version": "", + "tool_package": "", + "step_number": "3", + "input_uri_list": [ + "./bam_files/biocompute/B_S30.bam", + "./bam_files/biocompute/C_S31.bam", + "./bam_files/biocompute/D_S32.bam", + "./bam_files/biocompute/Hai003_S3.bam", + "./bam_files/biocompute/Hai004_S4.bam", + "./bam_files/biocompute/Hai006_S6.bam", + "./bam_files/biocompute/Hai007_S7.bam", + "./bam_files/biocompute/Hai012_S41.bam", + "./bam_files/biocompute/Hawi010_S16.bam", + "./bam_files/biocompute/Hawi015_S21.bam", + "./bam_files/biocompute/Hawi028_S35.bam", + "./bam_files/biocompute/Hawi032_S39.bam" + ], + "output_uri_list": [ + "/home/biocompute/B_S30.bam_sorted", + "/home/biocompute/C_S31.bam_sorted", + "/home/biocompute/D_S32.bam_sorted", + "/home/biocompute/Hai003_S3.bam_sorted", + "/home/biocompute/Hai004_S4.bam_sorted", + "/home/biocompute/Hai006_S6.bam_sorted", + "/home/biocompute/Hai007_S7.bam_sorted", + "/home/biocompute/Hai012_S41.bam_sorted", + "/home/biocompute/Hawi010_S16.bam_sorted", + "/home/biocompute/Hawi015_S21.bam_sorted", + 
"/home/biocompute/Hawi028_S35.bam_sorted", + "/home/biocompute/Hawi032_S39.bam_so" + ] + }, + { + "tool_name": "picard_MarkDuplicates", + "tool_desc": "This tool locates and tags duplicate reads in a BAM or SAM file", + "tool_version": "0.1.19-96b5f2294a", + "tool_package": "", + "step_number": "4", + "input_uri_list": [ + "B_sorted.bam", + "C_sorted.bam", + "D_sorted.bam", + "Hai003_sorted.bam", + "Hai004_sorted.bam", + "Hai006_sorted.bam", + "Hai007_sorted.bam", + "Hai012_sorted.bam", + "Hawi010_sorted.bam", + "Hawi015_sorted.bam", + "Hawi028_sorted.bam", + "Hawi032_sorted.bam" + ], + "output_uri_list": [ + "new_B_mark_duplicates.bam", + "new_C_mark_duplicates.bam", + "new_D_mark_duplicates.bam", + "new_Hai003_mark_duplicates.bam", + "new_Hai004_mark_duplicates.bam", + "new_Hai006_mark_duplicates.bam", + "new_Hai007_mark_duplicates.bam", + "new_Hai012_mark_duplicates.bam", + "new_Hawi010_mark_duplicates.bam", + "new_Hawi015_mark_duplicates.bam", + "new_Hawi028_mark_duplicates.bam", + "new_Hawi032_mark_duplicates.bam", + "B.txt", + "C.txt", + "D.txt", + "Hai003.txt", + "Hai004.txt", + "Hai006.txt", + "Hai007.txt", + "Hai012.txt", + "Hawi010.txt", + "Hawi015.txt", + "Hawi028.txt", + "Hawi032.txt" + ] + }, + { + "tool_name": "picard_AddOrReplaceReadGroups", + "tool_desc": "Replace read groups in a BAM file", + "tool_version": "0.1.19-96b5f2294a", + "tool_package": "", + "step_number": "5", + "input_uri_list": [ + "new_B_mark_duplicates.bam", + "new_C_mark_duplicates.bam", + "new_D_mark_duplicates.bam", + "new_Hai003_mark_duplicates.bam", + "new_Hai004_mark_duplicates.bam", + "new_Hai006_mark_duplicates.bam", + "new_Hai007_mark_duplicates.bam", + "new_Hai012_mark_duplicates.bam", + "new_Hawi010_mark_duplicates.bam", + "new_Hawi015_mark_duplicates.bam", + "new_Hawi028_mark_duplicates.bam", + "new_Hawi032_mark_duplicates.bam" + ], + "output_uri_list": [ + "with_header_new_B_mark_duplicates.bam", + "with_header_new_C_mark_duplicates.bam", + 
"with_header_new_D_mark_duplicates.bam", + "with_header_new_Hai003_mark_duplicates.bam", + "with_header_new_Hai004_mark_duplicates.bam", + "with_header_new_Hai006_mark_duplicates.bam", + "with_header_new_Hai007_mark_duplicates.bam", + "with_header_new_Hai012_mark_duplicates.bam", + "with_header_new_Hawi010_mark_duplicates.bam", + "with_header_new_Hawi015_mark_duplicates.bam", + "with_header_new_Hawi028_mark_duplicates.bam", + "with_header_new_Hawi032_mark_duplicates.bam" + ] + }, + { + "tool_name": "GenomeAnalysisTK_RealignerTargetCreator", + "tool_desc": "Determining (small) suspicious intervals which are likely in need of realignment", + "tool_version": "3.7-0-gcfedb67", + "tool_package": "", + "step_number": "6", + "input_uri_list": [ + "./biocompute/human_g1k_v37.fasta", + "with_header_new_B_mark_duplicates.bam", + "with_header_new_C_mark_duplicates.bam", + "with_header_new_D_mark_duplicates.bam", + "with_header_new_Hai003_mark_duplicates.bam", + "with_header_new_Hai004_mark_duplicates.bam", + "with_header_new_Hai006_mark_duplicates.bam", + "with_header_new_Hai007_mark_duplicates.bam", + "with_header_new_Hai012_mark_duplicates.bam", + "with_header_new_Hawi010_mark_duplicates.bam", + "with_header_new_Hawi015_mark_duplicates.bam", + "with_header_new_Hawi028_mark_duplicates.bam", + "with_header_new_Hawi032_mark_duplicates.bam" + ], + "output_uri_list": [ + "realignedB.list", + "realignedC.list", + "realignedD.list", + "realignedHai003.list", + "realignedHai004.list", + "realignedHai006.list", + "realignedHai007.list", + "realignedHai012.list", + "realignedHawi010.list", + "realignedHawi015.list", + "realignedHawi028.list", + "realignedHawi032.list" + ] + }, + { + "tool_name": "GenomeAnalysisTK_IndelRealigner", + "tool_desc": "Perform local realignment of reads around indels", + "tool_version": "3.7-0-gcfedb67", + "tool_package": "", + "step_number": "7", + "input_uri_list": [ + "./biocompute/human_g1k_v37.fasta", + 
"with_header_new_B_mark_duplicates.bam-targetIntervals", + "with_header_new_C_mark_duplicates.bam-targetIntervals", + "with_header_new_D_mark_duplicates.bam-targetIntervals", + "with_header_new_Hai003_mark_duplicates.bam-targetIntervals", + "with_header_new_Hai004_mark_duplicates.bam-targetIntervals", + "with_header_new_Hai006_mark_duplicates.bam-targetIntervals", + "with_header_new_Hai007_mark_duplicates.bam-targetIntervals", + "with_header_new_Hai012_mark_duplicates.bam-targetIntervals", + "with_header_new_Hawi010_mark_duplicates.bam-targetIntervals", + "with_header_new_Hawi015_mark_duplicates.bam-targetIntervals", + "with_header_new_Hawi028_mark_duplicates.bam-targetIntervals", + "with_header_new_Hawi032_mark_duplicates.bam-targetIntervals", + "realignedB.list", + "realignedC.list", + "realignedD.list", + "realignedHai003.list", + "realignedHai004.list", + "realignedHai006.list", + "realignedHai007.list", + "realignedHai012.list", + "realignedHawi010.list", + "realignedHawi015.list", + "realignedHawi028.list", + "realignedHawi032.list" + ], + "output_uri_list": [ + "B_realigned_reads.bam", + "C_realigned_reads.bam", + "D_realigned_reads.bam", + "Hai003_realigned_reads.bam", + "Hai004_realigned_reads.bam", + "Hai006_realigned_reads.bam", + "Hai007_realigned_reads.bam", + "Hai012_realigned_reads.bam", + "Hawi010_realigned_reads.bam", + "Hawi015_realigned_reads.bam", + "Hawi028_realigned_reads.bam", + "Hawi032_realigned_reads.bam" + ] + }, + { + "tool_name": "GenomeAnalysisTK_HaplotypeCaller", + "tool_desc": "Call germline SNPs and indels via local re-assembly of haplotypes", + "tool_version": "3.7-0-gcfedb67", + "tool_package": "", + "step_number": "8", + "input_uri_list": [ + "B_realigned_reads.bam", + "C_realigned_reads.bam", + "D_realigned_reads.bam", + "Hai003_realigned_reads.bam", + "Hai004_realigned_reads.bam", + "Hai006_realigned_reads.bam", + "Hai007_realigned_reads.bam", + "Hai012_realigned_reads.bam", + "Hawi010_realigned_reads.bam", + "Hawi015_realigned_reads.bam", + 
"Hawi028_realigned_reads.bam", + "Hawi032_realigned_reads.bam" + ], + "output_uri_list": [ + "B_gatk_raw_snps.vcf", + "C_gatk_raw_snps.vcf", + "D_gatk_raw_snps.vcf", + "Hai003_gatk_raw_snps.vcf", + "Hai004_gatk_raw_snps.vcf", + "Hai006_gatk_raw_snps.vcf", + "Hai007_gatk_raw_snps.vcf", + "Hai012_gatk_raw_snps.vcf", + "Hawi010_gatk_raw_snps.vcf", + "Hawi015_gatk_raw_snps.vcf", + "Hawi028_gatk_raw_snps.vcf", + "Hawi032_gatk_raw_snps.vcf" + ] + } + ] + }, + "execution_domain": { + "script": "https://github.com/biocompute-objects/HTS-CSRS/tree/master/8_gatk_Hemophilia/make_vcfs.py", + "pipeline_version": "1.0", + "platform": "linux", + "driver": "Python 2.7.10", + "software_prerequisites": [ + { + "name": "GenomeAnalysisToolKit", + "version": "3.6" + }, + { + "name": "Picard", + "version": "v2.6.0" + }, + { + "name": "samtools", + "version": "v0.1.19-96b5f2294a" + }, + { + "name": "Python", + "version": "2.7.10" + } + ], + "domain_prerequisites": [ + { + "url": "ftp://:22/", + "name": "access to ftp" + } + ], + "env_parameters": [ + "10 GB memory required" + ], + "script_type": "URI" + }, + "parametric_domain": { + "picard_MarkDuplicates_REMOVE_DUPLICATES": "no", + "picard_AddOrReplaceReadGroups_ReadGroupPlatform": "illumina", + "GenomeAnalysisTK_HaplotypeCaller_Intervals": "X", + "GenomeAnalysisTK_RealignerTargetCreator_Num_threads": "22" + }, + "io_domain": { + "reference_uri_list": [ + "ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz" + ], + "input_uri_list": [ + "B_gatk_raw_snps.vcf", + "C_gatk_raw_snps.vcf", + "D_gatk_raw_snps.vcf", + "Hai003_gatk_raw_snps.vcf", + "Hai004_gatk_raw_snps.vcf", + "Hai006_gatk_raw_snps.vcf", + "Hai007_gatk_raw_snps.vcf", + "Hai012_gatk_raw_snps.vcf", + "Hawi010_gatk_raw_snps.vcf", + "Hawi015_gatk_raw_snps.vcf", + "Hawi028_gatk_raw_snps.vcf", + "Hawi032_gatk_raw_snps.vcf" + ], + "output_uri_list": [ + "./B_S30.bam", + "./C_S31.bam", + "./D_S32.bam", + "./Hai003_S3.bam", + "./Hai004_S4.bam", + 
"./Hai006_S6.bam", + "./Hai007_S7.bam", + "./Hai012_S41.bam", + "./Hawi010_S16.bam", + "./Hawi015_S21.bam", + "./Hawi028_S35.bam", + "./Hawi032_S39.bam" + ] + } } \ No newline at end of file diff --git a/9_WXSwf/bcoexample_9.json b/9_WXSwf/bcoexample_9.json index 0d955c7..d77615e 100644 --- a/9_WXSwf/bcoexample_9.json +++ b/9_WXSwf/bcoexample_9.json @@ -1,122 +1,145 @@ { "id": "obj.1481139873", "name": "Exome Sequence Analysis", - "version": "5", + "version": "5", + "createdby": "hadley_king@gwmail.gwu.edu", + "created": "Sep 27, 2017 10:35:17", + "modified": "Jun 25, 2019 14:47:49", "digital_signature": "", "verification_status": "unreviewed", "publication_status": "draft", "usability_domain": ["For use in analysis of xome sequence data"], - "authors": [{"name": "Durga"}], + "authors": [{"orcid":"https://orcid.org/0000-0003-1409-4549"},{"name": "Addepalli, Kanakadurga"}], "description_domain": { "xref":[], "keywords": [], - "pipeline_steps": { - "FastQC":{ - "description": "A quality control tool for high throughput sequence data.", - "version": "sbg:toolkitVersion:0.11.4", - "step_number": 1, - "package": ["http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.4.zip"], - "input":["sbg://Durga/exome-sequencing/fastq_list.fastq.gz"], - "output":[ - "#FastQC.report_zip", - "#FastQC.report_html" - ] - }, - "SBG_Pair_FASTQs_by_Metadata":{ - "description": "Tool accepts list of FASTQ files for one sample as the input and groups them into pairs (two files for each paired end). This grouping is done using metadata values that are creating unique combination for each pair or of FASTQ files. Metadata that fields that are uniquely defining one FASTQ pair are Sample ID, Library ID, Platform unit ID and File segment number. Listed order of metadata fields is also representing their hierarchy in the metadata structure. 
Not all of these four metadata fields are required, but the present set has to be sufficient to create unique combinations for each pair of FASTQ files.", - "version": null, - "step_number": 1, - "package":["https://cgc.sbgenomics.com/u/stefanristeski/group-fastqs/apps/#sbg-pair-fastqs-by-metadata/9"], - "input":["#SBG_Pair_FASTQs_by_Metadata.fastq_list"], - "output":["#SBG_Pair_FASTQs_by_Metadata.tuple_list"] + "pipeline_steps": [ + { + "tool_name":"FastQC", + "tool_desc": "A quality control tool for high throughput sequence data.", + "tool_version": "sbg:toolkitVersion:0.11.4", + "step_number": "1", + "tool_package": "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.4.zip", + "input_uri_list":["sbg://Durga/exome-sequencing/fastq_list.fastq.gz"], + "output_uri_list":[ + "#FastQC.report_zip", + "#FastQC.report_html" + ] }, - "BWA_INDEX":{ - "description": "BWA INDEX constructs the FM-index (Full-text index in Minute space) for the reference genome.\nGenerated index files will be used with BWA MEM, BWA ALN, BWA SAMPE and BWA SAMSE tools.\n\nIf input reference file has TAR extension it is assumed that BWA indices came together with it. BWA INDEX will only pass that TAR to the output. If input is not TAR, the creation of BWA indices and its packing in TAR file (together with the reference) will be performed.", - "version": "sbg:toolkitVersion:0.7.13", - "step_number": 1, - "package":["http://sourceforge.net/projects/bio-bwa/"], - "input":["sbg://Durga/exome-sequencing/reference.fasta"], - "output":["#BWA_INDEX.indexed_reference"] + { + "tool_name": "SBG_Pair_FASTQs_by_Metadata", + "tool_desc": "Tool accepts list of FASTQ files for one sample as the input and groups them into pairs (two files for each paired end). This grouping is done using metadata values that are creating unique combination for each pair of FASTQ files. 
Metadata fields that are uniquely defining one FASTQ pair are Sample ID, Library ID, Platform unit ID and File segment number. Listed order of metadata fields is also representing their hierarchy in the metadata structure. Not all of these four metadata fields are required, but the present set has to be sufficient to create unique combinations for each pair of FASTQ files.", + "tool_version": "sbg:toolkitVersion:0.7.13", + "step_number": "1", + "tool_package": "https://cgc.sbgenomics.com/u/stefanristeski/group-fastqs/apps/#sbg-pair-fastqs-by-metadata/9", + "input_uri_list":["#SBG_Pair_FASTQs_by_Metadata.fastq_list"], + "output_uri_list":["#SBG_Pair_FASTQs_by_Metadata.tuple_list"] }, - "SBG_FASTA_Indices": { - "description": "Tool allows creating FASTA dictionary and index simultaneously which is necessary for running GATK tools. This version of tool for indexing uses SAMtools faidx command (toolkit version0.1.19), while for the FASTA dictionary is used Picard CreateFastaDictionary (toolkit version 1.140)", - "version": null, - "step_number": 1, - "package":["/u/Durga/exome-sequencing/apps/#Durga/exome-sequencing/sbg-fasta-indices/0"], - "input":["#SBG_FASTA_Indices.reference"], - "output":[ "#SBG_FASTA_Indices.fasta_reference", "#SBG_FASTA_Indices.fasta_index", "#SBG_FASTA_Indices.fasta_dict"] + { + "tool_name": "BWA_INDEX", + "tool_desc": "BWA INDEX constructs the FM-index (Full-text index in Minute space) for the reference genome. Generated index files will be used with BWA MEM, BWA ALN, BWA SAMPE and BWA SAMSE tools. If input reference file has TAR extension it is assumed that BWA indices came together with it. BWA INDEX will only pass that TAR to the output. 
If input is not TAR, the creation of BWA indices and its packing in TAR file (together with the reference) will be performed.", + "tool_version": "sbg:toolkitVersion:0.7.13", + "step_number": "1", + "tool_package": "http://sourceforge.net/projects/bio-bwa/", + "input_uri_list":["sbg://Durga/exome-sequencing/reference.fasta"], + "output_uri_list":["#BWA_INDEX.indexed_reference"] + }, + { + "tool_name": "SBG_FASTA_Indices", + "tool_desc": "Tool allows creating FASTA dictionary and index simultaneously which is necessary for running GATK tools. This version of tool for indexing uses SAMtools faidx command (toolkit version 0.1.19), while for the FASTA dictionary is used Picard CreateFastaDictionary (toolkit version 1.140)", + "tool_version": "sbg:toolkitVersion:0.7.13", + "step_number": "1", + "tool_package": "/u/Durga/exome-sequencing/apps/#Durga/exome-sequencing/sbg-fasta-indices/0", + "input_uri_list":["#SBG_FASTA_Indices.reference"], + "output_uri_list":[ "#SBG_FASTA_Indices.fasta_reference", "#SBG_FASTA_Indices.fasta_index", "#SBG_FASTA_Indices.fasta_dict"] }, - "BWA_MEM_Bundle":{ - "description": "**BWA MEM** is an algorithm designed for aligning sequence reads onto a large reference genome. BWA MEM is implemented as a component of BWA.", - "version": "sbg:toolkitVersion:0.7.13", - "step_number": 2, - "package":["http://sourceforge.net/projects/bio-bwa/"], - "input":[ + { + "tool_name": "BWA_MEM_Bundle", + "tool_desc": "**BWA MEM** is an algorithm designed for aligning sequence reads onto a large reference genome. 
BWA MEM is implemented as a component of BWA.", + "tool_version": "sbg:toolkitVersion:0.7.13", + "step_number": "2", + "tool_package": "http://sourceforge.net/projects/bio-bwa/", + "input_uri_list":[ "#BWA_INDEX.parameterList", "#SBG_Pair_FASTQs_by_Metadata.tuple_list", "#BWA_INDEX.indexed_reference" ], - "output":[ + "output_uri_list":[ "#BWA_MEM_Bundle.bam_index", "#BWA_MEM_Bundle.aligned_reads" ] }, - "Sambamba_Merge":{ - "description": "Sambamba Merge is used for merging several sorted BAM files into one. The sorting order of all the files must be the same, and it is maintained in the output file.", - "version": "sbg:toolkitVersion:0.5.9", - "step_number": 3, - "package":["https://github.com/lomereiter/sambamba/releases/tag/v0.5.9"], - "input":["#BWA_MEM_Bundle.aligned_reads"], - "output":["#Sambamba_Merge.merged_bam"] + { + "tool_name": "Sambamba_Merge", + "tool_desc": "Sambamba Merge is used for merging several sorted BAM files into one. The sorting order of all the files must be the same, and it is maintained in the output file.", + "tool_version": "sbg:toolkitVersion:0.5.9", + "step_number": "3", + "tool_package": "https://github.com/lomereiter/sambamba/releases/tag/v0.5.9", + "input_uri_list":["#BWA_MEM_Bundle.aligned_reads"], + "output_uri_list":["#Sambamba_Merge.merged_bam"] }, - "Picard_BuildBamIndex":{ - "description": "Picard BuildBamIndex generates a BAM index (.bai) file.", - "version": "sbg:toolkitVersion:1.140", - "step_number": 4, - "package":["https://github.com/broadinstitute/picard/zipball/master"], - "input":["#Sambamba_Merge.merged_bam"], - "output":["#Picard_BuildBamIndex.indexed_bam","#Picard_BuildBamIndex.index"] + { + "tool_name": "Picard_BuildBamIndex", + "tool_desc": "Picard BuildBamIndex generates a BAM index (.bai) file.", + "tool_version": "sbg:toolkitVersion:1.140", + "step_number": "4", + "tool_package": "https://github.com/broadinstitute/picard/zipball/master", + "input_uri_list":["#Sambamba_Merge.merged_bam"], + 
"output_uri_list":["#Picard_BuildBamIndex.indexed_bam","#Picard_BuildBamIndex.index"] }, - "GATK_IndelRealigner":{ - "description": "The local realignment process is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and specifically identify indels.\n\nThere are 2 steps to the realignment process:\n\n1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)\n2. 
Running the realigner over those intervals (IndelRealigner)\nFor more details, see the indel realignment method documentation.\n\nInput\nOne or more aligned BAM files and optionally one or more lists of known indels.\n\nOutput\nA realigned version of your input BAM file(s).\n\nUsage example:\n java -jar GenomeAnalysisTK.jar \\\n -T IndelRealigner \\\n -R reference.fasta \\\n -I input.bam \\\n --known indels.vcf \\\n -targetIntervals intervalListFromRTC.intervals \\\n -o realignedBam.bam\n \nCaveats\n\nThe input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.\nBecause reads produced from the 454 technology inherently contain false indels, the realigner will not work with them (or with reads from similar technologies).\nThis tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.\n\n(IMPORTANT) Reference \".fasta\" Secondary Files\n\nTools in GATK that require a fasta reference file also look for the reference file's corresponding .fai (fasta index) and .dict (fasta dictionary) files. The fasta index file allows random access to reference bases and the dictionary file is a dictionary of the contig names and sizes contained within the fasta reference. These two secondary files are essential for GATK to work properly. To append these two files to your fasta reference please use the 'SBG FASTA Indices' tool within your GATK based workflow before using any of the GATK tools.", - "version": "sbg:toolkitVersion:2.3.9 Lite", - "step_number": 5, - "package":["https://www.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=2.3-9-ge5ebf34"], - "input":[ + { + "tool_name": "GATK_IndelRealigner", + "tool_desc": "The local realignment process is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases is minimized across all the reads. 
In general, a large percent of regions requiring local realignment are due to the presence of an insertion or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and specifically identify indels. There are 2 steps to the realignment process: 1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool) 2. Running the realigner over those intervals (IndelRealigner) For more details, see the indel realignment method documentation. Input One or more aligned BAM files and optionally one or more lists of known indels. Output A realigned version of your input BAM file(s). Usage example: java -jar GenomeAnalysisTK.jar -T IndelRealigner -R reference.fasta -I input.bam --known indels.vcf -targetIntervals intervalListFromRTC.intervals -o realignedBam.bam Caveats The input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. 
Because reads produced from the 454 technology inherently contain false indels, the realigner will not work with them (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. (IMPORTANT) Reference \".fasta\" Secondary Files Tools in GATK that require a fasta reference file also look for the reference file's corresponding .fai (fasta index) and .dict (fasta dictionary) files. The fasta index file allows random access to reference bases and the dictionary file is a dictionary of the contig names and sizes contained within the fasta reference. These two secondary files are essential for GATK to work properly. To append these two files to your fasta reference please use the 'SBG FASTA Indices' tool within your GATK based workflow before using any of the GATK tools.", + "tool_version": "sbg:toolkitVersion:2.3.9 Lite", + "step_number": "5", + "tool_package": "https://www.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=2.3-9-ge5ebf34", + "input_uri_list":[ "sbg://Durga/exome-sequencing/target_intervals.txt", "#SBG_FASTA_Indices.fasta_reference", "#Picard_BuildBamIndex.indexed_bam" ], - "output":["#GATK_IndelRealigner.realigned_bam_file"] + "output_uri_list":["#GATK_IndelRealigner.realigned_bam_file"] }, - "Freebayes":{ - "description": "FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs (single-nucleotide polymorphisms), indels (insertions and deletions), MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a short-read sequencing alignment.\n\nFreeBayes incorporates a number of features in order to reduce the complexity of variant detection for researchers and developers:\n\n1. 
Indel realignment is accomplished internally using a read-independent method, and issues resulting from discordant alignments are dramatically reduced through the direct detection of haplotypes.\n2. The need for base quality recalibration is avoided through the direct detection of haplotypes. Sequencing platform errors tend to cluster (e.g. at the ends of reads), and generate unique, non-repeating haplotypes at a given locus.\n3. Variant quality recalibration is avoided by incorporating a number of metrics, such as read placement bias and allele balance, directly into the Bayesian model.\n\n###Common Issues\nFASTA INDEX FILE is not required. If it is not provided (as secondary file), it is generated. When it is provided, the tool runs faster.\nBAM INDEX FILES are not required, but when not provided (as separate input), random access is disabled.\nVARIANT INPUT INDEX FILE is required (as secondary file). It should be generated using Tabix BGZIP and Tabix Index file.\nREGION parameter should match data present in variant input file, both chromosome and positions.", - "version": "sbg:toolkitVersion:v1.0.2", - "step_number": 6, - "package":["https://github.com/ekg/freebayes"], - "input":["#SBG_FASTA_Indices.fasta_reference", "#GATK_IndelRealigner.realigned_bam_file"], - "output":["sbg://Durga/exome-sequencing/Freebayes.variants"] + { + "tool_name": "Freebayes", + "tool_desc": "FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs (single-nucleotide polymorphisms), indels (insertions and deletions), MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a short-read sequencing alignment. FreeBayes incorporates a number of features in order to reduce the complexity of variant detection for researchers and developers: 1. 
Indel realignment is accomplished internally using a read-independent method, and issues resulting from discordant alignments are dramatically reduced through the direct detection of haplotypes. 2. The need for base quality recalibration is avoided through the direct detection of haplotypes. Sequencing platform errors tend to cluster (e.g. at the ends of reads), and generate unique, non-repeating haplotypes at a given locus. 3. Variant quality recalibration is avoided by incorporating a number of metrics, such as read placement bias and allele balance, directly into the Bayesian model. ###Common Issues FASTA INDEX FILE is not required. If it is not provided (as secondary file), it is generated. When it is provided, the tool runs faster. BAM INDEX FILES are not required, but when not provided (as separate input), random access is disabled. VARIANT INPUT INDEX FILE is required (as secondary file). It should be generated using Tabix BGZIP and Tabix Index file. REGION parameter should match data present in variant input file, both chromosome and positions.", + "tool_version": "sbg:toolkitVersion:v1.0.2", + "step_number": "6", + "tool_package": "https://github.com/ekg/freebayes", + "input_uri_list":["#SBG_FASTA_Indices.fasta_reference", "#GATK_IndelRealigner.realigned_bam_file"], + "output_uri_list":["sbg://Durga/exome-sequencing/Freebayes.variants"] } - } + + ] }, "execution_domain": { - "platform": "SBG", - "url": "sbg://Durga/exome-sequencing/exomeseqanalysis02-removesortaddparameters/5/raw/", + "script": "sbg://Durga/exome-sequencing/exomeseqanalysis02-removesortaddparameters/5/raw/", "pipeline_version": "5", - "env_parameters": ["MemRequirement:2500", "CPURequirement:Eval_input_read_size"], + "platform": "SBG", "driver": "SBGdriver", - "script": "sbg://Durga/exome-sequencing/exomeseqanalysis02-removesortaddparameters/5/raw/", - "prerequisites": [ 
{"name":"FastQC","version":"sbg:toolkitVersion:0.11.4"}, - {"name":"SBG_Pair_FASTQs_by_Metadata","version": null}, + {"name":"SBG_Pair_FASTQs_by_Metadata","version": "sbg:toolkitVersion:0.7.13"}, {"name":"BWA_INDEX","version":"sbg:toolkitVersion:0.7.13"}, - {"name":"SBG_FASTA_Indices","version":null}, + {"name":"SBG_FASTA_Indices","version": "sbg:toolkitVersion:0.7.13"}, {"name":"BWA_MEM_Bundle","version":"sbg:toolkitVersion:0.7.13"}, {"name":"Sambamba_Merge","version":"sbg:toolkitVersion:0.5.9"}, {"name":"Picard_BuildBamIndex","version":"sbg:toolkitVersion:1.140"}, {"name":"GATK_IndelRealigner","version":"sbg:toolkitVersion:2.3.9 Lite"}, {"name":"Freebayes","version":"sbg:toolkitVersion:v1.0.2"} - ] + ], + "domain_prerequisites": [ + { + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/", + "name": "access to e-utils" + }, + { + "url": "https://accounts.sbgenomics.com/auth/login?", + "name": "access to SBG" + } + ], + "env_parameters": ["MemRequirement:2500", "CPURequirement:Eval_input_read_size"] }, "parametric_domain": { "FastQC_kmers": "7", @@ -178,7 +201,7 @@ "Freebayes_genotyping_max_banddepth": "6", "Freebayes_posterior_integration_limits": "1,3", "Freebayes_gvcf": "TRUE" - }, + }, "io_domain": { "reference_uri": [ "sbg://Durga/exome-sequencing/reference.fasta", @@ -194,5 +217,5 @@ "sbg://Durga/exome-sequencing/Freebayes.variants", "sbg://Durga/exome-sequencing/Sambamba_Merge.merged_bam" ] - } + } } diff --git a/schema.v1.0/infered.json b/schema.v1.0/infered.json new file mode 100644 index 0000000..43d2b6e --- /dev/null +++ b/schema.v1.0/infered.json @@ -0,0 +1,420 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://example.com/root.json", + "type": "object", + "title": "BioCompute Schema", + "definitions": { + }, + "required": [ + "id", + "name", + "version", + "createdby", + "created", + "modified", + "digital_signature", + "verification_status", + "publication_status", + "usability_domain", + "authors", + 
"description_domain", + "execution_domain", + "parametric_domain", + "io_domain" + ], + "properties": { + "id": { + "type": "string", + "title": "ID", + "examples": [ + "obj.1283" + ], + "pattern": "^(.*)$" + }, + "name": { + "type": "string", + "title": "Name", + "examples": [ + "Quasispecies analysis of HIV-1 [taxID:12721] samples taken from H1V-1 infected individuals [taxID:9606] and detection of novel HIV-1 recombinant subtypes" + ], + "pattern": "^(.*)$" + }, + "structured_name": { + "type": "string", + "title": "Structured Name", + "default": "", + "examples": [ + "Quasispecies analysis of HIV-1" + ], + "pattern": "^(.*)$" + }, + "version": { + "type": "string", + "title": "Version", + "default": "1.0", + "examples": [ + "1.0" + ], + "pattern": "^(.*)$" + }, + "digital_signature": { + "type": "string", + "title": "Digital Signature", + "examples": [ + "324kjhgiufgioerfhlsjdhbfalskjd" + ], + "pattern": "^(.*)$" + }, + "verification_status": { + "type": "string", + "title": "Verification Status", + "default": "unreviewed", + "examples": [ + "in_progress", + "unreviewed", + "reviewed", + "published", + "rejected" + ], + "pattern": "^(.*)$" + }, + "publication_status": { + "type": "string", + "title": "Publication Status", + "default": "draft", + "examples": [ + "draft", + "in_progress", + "private", + "open_access" + ], + "pattern": "^(.*)$" + }, + "authors": { + "type": "array", + "title": "Authors", + "items": { + "type": "object", + "properties": { + "orcid": { + "type": "string", + "title": "Open Researcher and Contributor ID", + "examples": [ + "https://orcid.org/0000-0003-1409-4549" + ], + "pattern": "^(.*)$" + }, + "name": { + "type": "string", + "title": "Name", + "examples": [ + "John Doe" + ], + "pattern": "^(.*)$" + } + }, + "additionalProperties": false + } + }, + "usability_domain": { + "type": "array", + "title": "Usability Domain", + "items": { + "type": "string", + "examples": [ + "This pipeline can be used for discovering the new recombinant 
subtypes of viruses" + ], + "pattern": "^(.*)$" + } + }, + "createdby": { + "type": "string", + "title": "Created By", + "examples": [ + "darwin@gwu.edu" + ], + "pattern": "^(.*)$" + }, + "created": { + "type": "string", + "title": "Created", + "examples": [ + "Feb 02, 2017 14:37:50" + ], + "pattern": "^(.*)$" + }, + "modified": { + "type": "string", + "title": "Modified", + "examples": [ + "Jun 16, 2017 15:04:43" + ], + "pattern": "^(.*)$" + }, + "description_domain": { + "type": "object", + "title": "Description Domain", + "required": [ + "keywords", + "xref", + "pipeline_steps" + ], + "properties": { + "keywords": { + "type": "array", + "title": "Keywords", + "items": { + "type": "string", + "examples": [ + "clonal discovery", + "quasispecies", + "clonal contigs", + "HIV-1" + ], + "pattern": "^(.*)$" + } + }, + "xref": { + "type": "array", + "title": "External References", + "items": { + "type": "string", + "examples": [ + "UBERON:0001969", + "taxID:9606", + "taxID:12721" + ], + "pattern": "^(.*)$" + } + }, + "pipeline_steps": { + "type": "array", + "title": "Pipeline Steps", + "items": { + "type": "object", + "required": [ + "tool_name", + "tool_desc", + "tool_version", + "tool_package", + "step_number", + "input_uri_list", + "output_uri_list" + ], + "properties": { + "tool_name": { + "type": "string", + "title": "Tool Name", + "examples": [ + "HIVE-hexagon" + ], + "pattern": "^(.*)$" + }, + "tool_desc": { + "type": "string", + "title": "Tool Description", + "examples": [ + "Aligns reads to a set of reference" + ], + "pattern": "^(.*)$" + }, + "tool_version": { + "type": "string", + "title": "Tool Version", + "examples": [ + "1.4.3" + ], + "pattern": "^(.*)$" + }, + "tool_package": { + "type": "string", + "title": "Tool Requirements", + "examples": [ + "HIVE" + ], + "pattern": "^(.*)$" + }, + "step_number": { + "type": "string", + "title": "Step Number", + "examples": [ + "1" + ], + "pattern": "^(.*)$" + }, + "input_uri_list": { + "type": "array", + "title": 
"Input URI List", + "items": { + "type": "string", + "examples": [ + "https://hive/data/nuc-read/645175" + ], + "pattern": "^(.*)$" + } + }, + "output_uri_list": { + "type": "array", + "title": "Output URI List", + "items": { + "type": "string", + "examples": [ + "https://hive/data/hitlist/557006.csv" + ], + "pattern": "^(.*)$" + } + } + } + } + } + } + }, + "execution_domain": { + "type": "object", + "title": "Execution Domain", + "required": [ + "script", + "pipeline_version", + "platform", + "driver", + "software_prerequisites", + "domain_prerequisites", + "env_parameters", + "script_type" + ], + "properties": { + "script": { + "type": "string", + "title": "Script", + "examples": [ + "hive://workflows/quasispecies_analysis_of_HIV-1_viruses_hive.py" + ], + "pattern": "^(.*)$" + }, + "pipeline_version": { + "type": "string", + "title": "Pipeline Version", + "examples": [ + "1.0" + ], + "pattern": "^(.*)$" + }, + "platform": { + "type": "string", + "title": "Platform", + "examples": [ + "HIVE" + ], + "pattern": "^(.*)$" + }, + "driver": { + "type": "string", + "title": "The Driver Schema", + "examples": [ + "//hive.biochemistry.gwu.edu/hive-driver" + ], + "pattern": "^(.*)$" + }, + "software_prerequisites": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "pattern": "^(.*)$" + }, + "version": { + "type": "string", + "pattern": "^(.*)$" + } + } + } + }, + "domain_prerequisites": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "pattern": "^(.*)$" + }, + "name": { + "type": "string", + "pattern": "^(.*)$" + } + } + } + }, + "env_parameters": { + "type": "array", + "title": "Environment Parameters", + "items": { + "type": "string", + "examples": [ + "HIVEv1.4.3" + ], + "pattern": "^(.*)$" + } + }, + "script_type": { + "type": "string", + "title": "Script Type", + "examples": [ + "URI" + ], + "pattern": "^(.*)$" + } + } + }, + "parametric_domain": { + 
"type": "object", + "title": "Parametric Domain" + }, + "io_domain": { + "$id": "#/properties/io_domain", + "type": "object", + "title": "The Io_domain Schema", + "required": [ + "input_uri_list", + "output_uri_list" + ], + "properties": { + "reference_uri_list": { + "type": "array", + "title": "Reference URI List", + "items": { + "type": "string", + "examples": [ + "hive://genomes/333333" + ], + "pattern": "^(.*)$" + } + }, + "input_uri_list": { + "type": "array", + "title": "Input URI List", + "items": { + "type": "string", + "examples": [ + "hive://genomes/333333" + ], + "pattern": "^(.*)$" + } + }, + "output_uri_list": { + "type": "array", + "title": "Output URI List", + "items": { + "type": "string", + "examples": [ + "hive://data/888888/popSummary" + ], + "pattern": "^(.*)$" + } + } + } + } + } +} \ No newline at end of file diff --git a/schema.v1.0/validate.py b/schema.v1.0/validate.py new file mode 100755 index 0000000..0619423 --- /dev/null +++ b/schema.v1.0/validate.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +################################################################################ + ##Validate## +"""Used to test a BCO against the schema. 
The following commands were used to run the script: + + cd BCO_specification/ + python -m venv env + source env/bin/activate + pip install jsonschema jsonref + python validate.py HCV1a.json $PWD/schemas/biocomputeobject.json + + """ +################################################################################ +import json +import jsonref +import os +import sys + +from jsonschema import validate +#______________________________________________________________________________# +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('json', type=argparse.FileType('r'), help="json to validate") + parser.add_argument('schema', type=argparse.FileType('r'), help="root json schema to validate against") + args = parser.parse_args() + base_uri = 'file://{}/'.format(os.path.dirname(args.schema.name)) + data = json.load(args.json) + schema = jsonref.load(args.schema, base_uri=base_uri, jsonschema=True) + return validate(data, schema) +#______________________________________________________________________________# +if __name__ == "__main__": + main() +#print a validation mesage + print("Schema Valid") \ No newline at end of file