From 3e430381daf40708d523781a6e0e7a84c70df4b5 Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Fri, 19 Jul 2024 10:46:00 -0700 Subject: [PATCH] 10.6.0 --- nmdc_schema/nmdc.py | 108 ++++-- nmdc_schema/nmdc.schema.json | 327 ++++++++++------- .../nmdc_materialized_patterns.schema.json | 337 ++++++++++-------- nmdc_schema/nmdc_materialized_patterns.yaml | 144 ++++++-- 4 files changed, 594 insertions(+), 322 deletions(-) diff --git a/nmdc_schema/nmdc.py b/nmdc_schema/nmdc.py index 00ef32abdd..0faf0e016d 100644 --- a/nmdc_schema/nmdc.py +++ b/nmdc_schema/nmdc.py @@ -1,5 +1,5 @@ # Auto generated from nmdc.yaml by pythongen.py version: 0.0.1 -# Generation date: 2024-06-21T09:39:03 +# Generation date: 2024-07-19T10:42:33 # Schema: NMDC # # id: https://w3id.org/nmdc/nmdc @@ -298,7 +298,7 @@ class MetatranscriptomeAnnotationActivityId(WorkflowExecutionActivityId): pass -class MetatranscriptomeActivityId(WorkflowExecutionActivityId): +class MetatranscriptomeExpressionAnalysisId(WorkflowExecutionActivityId): pass @@ -414,7 +414,9 @@ class Database(YAMLRoot): metagenome_assembly_set: Optional[Union[Dict[Union[str, MetagenomeAssemblyId], Union[dict, "MetagenomeAssembly"]], List[Union[dict, "MetagenomeAssembly"]]]] = empty_dict() metagenome_sequencing_activity_set: Optional[Union[Dict[Union[str, MetagenomeSequencingActivityId], Union[dict, "MetagenomeSequencingActivity"]], List[Union[dict, "MetagenomeSequencingActivity"]]]] = empty_dict() metaproteomics_analysis_activity_set: Optional[Union[Dict[Union[str, MetaproteomicsAnalysisActivityId], Union[dict, "MetaproteomicsAnalysisActivity"]], List[Union[dict, "MetaproteomicsAnalysisActivity"]]]] = empty_dict() - metatranscriptome_activity_set: Optional[Union[Dict[Union[str, MetatranscriptomeActivityId], Union[dict, "MetatranscriptomeActivity"]], List[Union[dict, "MetatranscriptomeActivity"]]]] = empty_dict() + metatranscriptome_annotation_set: Optional[Union[Dict[Union[str, MetatranscriptomeAnnotationActivityId], Union[dict, 
"MetatranscriptomeAnnotationActivity"]], List[Union[dict, "MetatranscriptomeAnnotationActivity"]]]] = empty_dict() + metatranscriptome_assembly_set: Optional[Union[Dict[Union[str, MetatranscriptomeAssemblyId], Union[dict, "MetatranscriptomeAssembly"]], List[Union[dict, "MetatranscriptomeAssembly"]]]] = empty_dict() + metatranscriptome_expression_analysis_set: Optional[Union[Dict[Union[str, MetatranscriptomeExpressionAnalysisId], Union[dict, "MetatranscriptomeExpressionAnalysis"]], List[Union[dict, "MetatranscriptomeExpressionAnalysis"]]]] = empty_dict() nom_analysis_activity_set: Optional[Union[Dict[Union[str, NomAnalysisActivityId], Union[dict, "NomAnalysisActivity"]], List[Union[dict, "NomAnalysisActivity"]]]] = empty_dict() omics_processing_set: Optional[Union[Dict[Union[str, OmicsProcessingId], Union[dict, "OmicsProcessing"]], List[Union[dict, "OmicsProcessing"]]]] = empty_dict() planned_process_set: Optional[Union[Dict[Union[str, PlannedProcessId], Union[dict, "PlannedProcess"]], List[Union[dict, "PlannedProcess"]]]] = empty_dict() @@ -461,7 +463,11 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): self._normalize_inlined_as_list(slot_name="metaproteomics_analysis_activity_set", slot_type=MetaproteomicsAnalysisActivity, key_name="id", keyed=True) - self._normalize_inlined_as_list(slot_name="metatranscriptome_activity_set", slot_type=MetatranscriptomeActivity, key_name="id", keyed=True) + self._normalize_inlined_as_list(slot_name="metatranscriptome_annotation_set", slot_type=MetatranscriptomeAnnotationActivity, key_name="id", keyed=True) + + self._normalize_inlined_as_list(slot_name="metatranscriptome_assembly_set", slot_type=MetatranscriptomeAssembly, key_name="id", keyed=True) + + self._normalize_inlined_as_list(slot_name="metatranscriptome_expression_analysis_set", slot_type=MetatranscriptomeExpressionAnalysis, key_name="id", keyed=True) self._normalize_inlined_as_list(slot_name="nom_analysis_activity_set", slot_type=NomAnalysisActivity, 
key_name="id", keyed=True) @@ -4007,6 +4013,8 @@ class LibraryPreparation(BiosampleProcessing): library_preparation_kit: Optional[str] = None library_type: Optional[Union[str, "LibraryTypeEnum"]] = None pcr_cycles: Optional[int] = None + is_stranded: Optional[Union[bool, Bool]] = None + stranded_orientation: Optional[Union[str, "StrandedOrientationEnum"]] = None def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): if self._is_empty(self.id): @@ -4035,6 +4043,12 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): if self.pcr_cycles is not None and not isinstance(self.pcr_cycles, int): self.pcr_cycles = int(self.pcr_cycles) + if self.is_stranded is not None and not isinstance(self.is_stranded, Bool): + self.is_stranded = Bool(self.is_stranded) + + if self.stranded_orientation is not None and not isinstance(self.stranded_orientation, StrandedOrientationEnum): + self.stranded_orientation = StrandedOrientationEnum(self.stranded_orientation) + super().__post_init__(**kwargs) self.designated_class = str(self.class_class_curie) @@ -5600,18 +5614,18 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): @dataclass -class MetatranscriptomeActivity(WorkflowExecutionActivity): +class MetatranscriptomeExpressionAnalysis(WorkflowExecutionActivity): """ - A metatranscriptome activity that e.g. pools assembly and annotation activity. + A workflow process that provides expression values and read counts for gene features predicted on the contigs. 
""" _inherited_slots: ClassVar[List[str]] = [] - class_class_uri: ClassVar[URIRef] = NMDC["MetatranscriptomeActivity"] - class_class_curie: ClassVar[str] = "nmdc:MetatranscriptomeActivity" - class_name: ClassVar[str] = "MetatranscriptomeActivity" - class_model_uri: ClassVar[URIRef] = NMDC.MetatranscriptomeActivity + class_class_uri: ClassVar[URIRef] = NMDC["MetatranscriptomeExpressionAnalysis"] + class_class_curie: ClassVar[str] = "nmdc:MetatranscriptomeExpressionAnalysis" + class_name: ClassVar[str] = "MetatranscriptomeExpressionAnalysis" + class_model_uri: ClassVar[URIRef] = NMDC.MetatranscriptomeExpressionAnalysis - id: Union[str, MetatranscriptomeActivityId] = None + id: Union[str, MetatranscriptomeExpressionAnalysisId] = None execution_resource: str = None git_url: str = None has_input: Union[Union[str, NamedThingId], List[Union[str, NamedThingId]]] = None @@ -5622,8 +5636,8 @@ class MetatranscriptomeActivity(WorkflowExecutionActivity): def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): if self._is_empty(self.id): self.MissingRequiredField("id") - if not isinstance(self.id, MetatranscriptomeActivityId): - self.id = MetatranscriptomeActivityId(self.id) + if not isinstance(self.id, MetatranscriptomeExpressionAnalysisId): + self.id = MetatranscriptomeExpressionAnalysisId(self.id) if self.type is not None and not isinstance(self.type, str): self.type = str(self.type) @@ -5923,6 +5937,26 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): # Enumerations +class StrandedOrientationEnum(EnumDefinitionImpl): + """ + This enumeration specifies information about stranded RNA library preparations. 
+ """ + _defn = EnumDefinition( + name="StrandedOrientationEnum", + description="This enumeration specifies information about stranded RNA library preparations.", + ) + + @classmethod + def _addvals(cls): + setattr(cls, "antisense orientation", + PermissibleValue( + text="antisense orientation", + description="Orientation that is complementary (non-coding) to a sequence of messenger RNA.")) + setattr(cls, "sense orientation", + PermissibleValue( + text="sense orientation", + description="Orientation that corresponds to the coding sequence of messenger RNA.")) + class InstrumentModelEnum(EnumDefinitionImpl): Orbitrap = PermissibleValue(text="Orbitrap") @@ -6327,6 +6361,22 @@ def _addvals(cls): PermissibleValue( text="LC-DDA-MS/MS Raw Data", description="Liquid chromatographically separated MS1 and Data-Dependent MS2 binary instrument file")) + setattr(cls, "Metatranscriptome Expression", + PermissibleValue( + text="Metatranscriptome Expression", + description="""Metatranscriptome expression values and read counts for gene features predicted on contigs""")) + setattr(cls, "Metatranscriptome Expression Intergenic", + PermissibleValue( + text="Metatranscriptome Expression Intergenic", + description="Metatranscriptome expression values and read counts for intergenic regions.")) + setattr(cls, "Metatranscriptome Expression Info File", + PermissibleValue( + text="Metatranscriptome Expression Info File", + description="File containing version information on the expression workflow")) + setattr(cls, "rRNA Filtered Sequencing Reads", + PermissibleValue( + text="rRNA Filtered Sequencing Reads", + description="File containing ribosomal reads from the read qc filtering step.")) class CreditEnum(EnumDefinitionImpl): @@ -6517,9 +6567,15 @@ class FailureWhereEnum(EnumDefinitionImpl): MetagenomeAssembly = PermissibleValue( text="MetagenomeAssembly", description="A failure has occurred in metagenome assembly, a workflow process.") - MetatranscriptomeActivity = PermissibleValue( - 
text="MetatranscriptomeActivity", - description="A failure has occurred in metatranscriptome analysis, a workflow process.") + MetatranscriptomeExpressionAnalysis = PermissibleValue( + text="MetatranscriptomeExpressionAnalysis", + description="A failure has occurred in metatranscriptome expression analysis, a workflow process.") + MetatranscriptomeAnnotation = PermissibleValue( + text="MetatranscriptomeAnnotation", + description="A failure has occurred in metatranscriptome annotation analysis, a workflow process.") + MetatranscriptomeAssembly = PermissibleValue( + text="MetatranscriptomeAssembly", + description="A failure has occurred in metatranscriptome assembly analysis, a workflow process.") MagsAnalysisActivity = PermissibleValue( text="MagsAnalysisActivity", description="""A failure has occurred in binning, a workflow process to generate metagenome-assembled genomes (MAGS).""") @@ -8747,6 +8803,12 @@ class slots: slots.pcr_cycles = Slot(uri=NMDC.pcr_cycles, name="pcr_cycles", curie=NMDC.curie('pcr_cycles'), model_uri=NMDC.pcr_cycles, domain=None, range=Optional[int]) +slots.is_stranded = Slot(uri=NMDC.is_stranded, name="is_stranded", curie=NMDC.curie('is_stranded'), + model_uri=NMDC.is_stranded, domain=None, range=Optional[Union[bool, Bool]]) + +slots.stranded_orientation = Slot(uri=NMDC.stranded_orientation, name="stranded_orientation", curie=NMDC.curie('stranded_orientation'), + model_uri=NMDC.stranded_orientation, domain=None, range=Optional[Union[str, "StrandedOrientationEnum"]]) + slots.mass = Slot(uri=NMDC.mass, name="mass", curie=NMDC.curie('mass'), model_uri=NMDC.mass, domain=None, range=Optional[Union[dict, QuantityValue]]) @@ -8867,8 +8929,8 @@ class slots: slots.metagenome_sequencing_activity_set = Slot(uri=NMDC.metagenome_sequencing_activity_set, name="metagenome_sequencing_activity_set", curie=NMDC.curie('metagenome_sequencing_activity_set'), model_uri=NMDC.metagenome_sequencing_activity_set, domain=Database, range=Optional[Union[Dict[Union[str, 
MetagenomeSequencingActivityId], Union[dict, "MetagenomeSequencingActivity"]], List[Union[dict, "MetagenomeSequencingActivity"]]]]) -slots.metatranscriptome_activity_set = Slot(uri=NMDC.metatranscriptome_activity_set, name="metatranscriptome_activity_set", curie=NMDC.curie('metatranscriptome_activity_set'), - model_uri=NMDC.metatranscriptome_activity_set, domain=Database, range=Optional[Union[Dict[Union[str, MetatranscriptomeActivityId], Union[dict, "MetatranscriptomeActivity"]], List[Union[dict, "MetatranscriptomeActivity"]]]]) +slots.metatranscriptome_expression_analysis_set = Slot(uri=NMDC.metatranscriptome_expression_analysis_set, name="metatranscriptome_expression_analysis_set", curie=NMDC.curie('metatranscriptome_expression_analysis_set'), + model_uri=NMDC.metatranscriptome_expression_analysis_set, domain=Database, range=Optional[Union[Dict[Union[str, MetatranscriptomeExpressionAnalysisId], Union[dict, "MetatranscriptomeExpressionAnalysis"]], List[Union[dict, "MetatranscriptomeExpressionAnalysis"]]]]) slots.read_qc_analysis_activity_set = Slot(uri=NMDC.read_qc_analysis_activity_set, name="read_qc_analysis_activity_set", curie=NMDC.curie('read_qc_analysis_activity_set'), model_uri=NMDC.read_qc_analysis_activity_set, domain=Database, range=Optional[Union[Dict[Union[str, ReadQcAnalysisActivityId], Union[dict, "ReadQcAnalysisActivity"]], List[Union[dict, "ReadQcAnalysisActivity"]]]]) @@ -8894,6 +8956,12 @@ class slots: slots.library_preparation_set = Slot(uri=NMDC.library_preparation_set, name="library_preparation_set", curie=NMDC.curie('library_preparation_set'), model_uri=NMDC.library_preparation_set, domain=Database, range=Optional[Union[Dict[Union[str, LibraryPreparationId], Union[dict, "LibraryPreparation"]], List[Union[dict, "LibraryPreparation"]]]]) +slots.metatranscriptome_assembly_set = Slot(uri=NMDC.metatranscriptome_assembly_set, name="metatranscriptome_assembly_set", curie=NMDC.curie('metatranscriptome_assembly_set'), + 
model_uri=NMDC.metatranscriptome_assembly_set, domain=Database, range=Optional[Union[Dict[Union[str, MetatranscriptomeAssemblyId], Union[dict, "MetatranscriptomeAssembly"]], List[Union[dict, "MetatranscriptomeAssembly"]]]]) + +slots.metatranscriptome_annotation_set = Slot(uri=NMDC.metatranscriptome_annotation_set, name="metatranscriptome_annotation_set", curie=NMDC.curie('metatranscriptome_annotation_set'), + model_uri=NMDC.metatranscriptome_annotation_set, domain=Database, range=Optional[Union[Dict[Union[str, MetatranscriptomeAnnotationActivityId], Union[dict, "MetatranscriptomeAnnotationActivity"]], List[Union[dict, "MetatranscriptomeAnnotationActivity"]]]]) + slots.omics_type = Slot(uri=NMDC.omics_type, name="omics_type", curie=NMDC.curie('omics_type'), model_uri=NMDC.omics_type, domain=OmicsProcessing, range=Optional[Union[dict, "ControlledTermValue"]]) @@ -11725,8 +11793,8 @@ class slots: model_uri=NMDC.MetatranscriptomeAnnotationActivity_id, domain=MetatranscriptomeAnnotationActivity, range=Union[str, MetatranscriptomeAnnotationActivityId], pattern=re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\-\/\.,]*$')) -slots.MetatranscriptomeActivity_id = Slot(uri=NMDC.id, name="MetatranscriptomeActivity_id", curie=NMDC.curie('id'), - model_uri=NMDC.MetatranscriptomeActivity_id, domain=MetatranscriptomeActivity, range=Union[str, MetatranscriptomeActivityId], +slots.MetatranscriptomeExpressionAnalysis_id = Slot(uri=NMDC.id, name="MetatranscriptomeExpressionAnalysis_id", curie=NMDC.curie('id'), + model_uri=NMDC.MetatranscriptomeExpressionAnalysis_id, domain=MetatranscriptomeExpressionAnalysis, range=Union[str, MetatranscriptomeExpressionAnalysisId], pattern=re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\-\/\.,]*$')) slots.MagsAnalysisActivity_id = Slot(uri=NMDC.id, name="MagsAnalysisActivity_id", curie=NMDC.curie('id'), diff --git a/nmdc_schema/nmdc.schema.json b/nmdc_schema/nmdc.schema.json index da3fd61650..b93ff1d2fc 100644 --- 
a/nmdc_schema/nmdc.schema.json +++ b/nmdc_schema/nmdc.schema.json @@ -3706,10 +3706,24 @@ }, "type": "array" }, - "metatranscriptome_activity_set": { - "description": "This property links a database object to the set of metatranscriptome analysis activities.", + "metatranscriptome_annotation_set": { + "description": "This property links a database object to the set of metatranscriptome annotations within it.", "items": { - "$ref": "#/$defs/MetatranscriptomeActivity" + "$ref": "#/$defs/MetatranscriptomeAnnotationActivity" + }, + "type": "array" + }, + "metatranscriptome_assembly_set": { + "description": "This property links a database object to the set of metatranscriptome assemblies within it.", + "items": { + "$ref": "#/$defs/MetatranscriptomeAssembly" + }, + "type": "array" + }, + "metatranscriptome_expression_analysis_set": { + "description": "This property links a database object to the set of metatranscriptome expression analysis activities.", + "items": { + "$ref": "#/$defs/MetatranscriptomeExpressionAnalysis" }, "type": "array" }, @@ -4279,7 +4293,9 @@ "Extraction", "LibraryPreparation", "MetagenomeAssembly", - "MetatranscriptomeActivity", + "MetatranscriptomeExpressionAnalysis", + "MetatranscriptomeAnnotation", + "MetatranscriptomeAssembly", "MagsAnalysisActivity", "ReadQcAnalysisActivity", "ReadBasedTaxonomyAnalysisActivity", @@ -4462,7 +4478,11 @@ "Pfam Annotation GFF", "Annotation Statistics", "Direct Infusion FT ICR-MS Raw Data", - "LC-DDA-MS/MS Raw Data" + "LC-DDA-MS/MS Raw Data", + "Metatranscriptome Expression", + "Metatranscriptome Expression Intergenic", + "Metatranscriptome Expression Info File", + "rRNA Filtered Sequencing Reads" ], "title": "FileTypeEnum", "type": "string" @@ -5131,6 +5151,10 @@ "description": "The name of the instrument that was used for processing the sample.", "type": "string" }, + "is_stranded": { + "description": "Is the (RNA) library stranded or non-stranded (unstranded).", + "type": "boolean" + },
"library_preparation_kit": { "type": "string" }, @@ -5162,6 +5186,10 @@ "start_date": { "description": "The date on which any process or activity was started", "type": "string" + }, + "stranded_orientation": { + "$ref": "#/$defs/StrandedOrientationEnum", + "description": "Lists the strand orientation for a stranded RNA library preparation." } }, "required": [ @@ -6307,7 +6335,7 @@ "title": "MetaproteomicsAnalysisActivity", "type": "object" }, - "MetatranscriptomeActivity": { + "MetatranscriptomeAnnotationActivity": { "additionalProperties": false, "allOf": [ { @@ -6351,7 +6379,7 @@ } } ], - "description": "A metatranscriptome activity that e.g. pools assembly and annotation activity.", + "description": "", "properties": { "alternative_identifiers": { "description": "A list of alternative identifiers for the entity.", @@ -6371,6 +6399,14 @@ "git_url": { "type": "string" }, + "gold_analysis_project_identifiers": { + "description": "identifiers for corresponding analysis project in GOLD", + "items": { + "pattern": "^gold:Ga[0-9]+$", + "type": "string" + }, + "type": "array" + }, "has_failure_categorization": { "items": { "$ref": "#/$defs/FailureCategorization" @@ -6442,10 +6478,10 @@ "started_at_time", "ended_at_time" ], - "title": "MetatranscriptomeActivity", + "title": "MetatranscriptomeAnnotationActivity", "type": "object" }, - "MetatranscriptomeAnnotationActivity": { + "MetatranscriptomeAssembly": { "additionalProperties": false, "allOf": [ { @@ -6499,6 +6535,46 @@ }, "type": "array" }, + "asm_score": { + "description": "A score for comparing metagenomic assembly quality from same sample.", + "type": "number" + }, + "contig_bp": { + "description": "Total size in bp of all contigs.", + "type": "number" + }, + "contigs": { + "description": "The sum of the (length*log(length)) of all contigs, times some constant.
Increase the contiguity, the score will increase", + "type": "number" + }, + "ctg_l50": { + "description": "Given a set of contigs, the L50 is defined as the sequence length of the shortest contig at 50% of the total genome length.", + "type": "number" + }, + "ctg_l90": { + "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all contigs of that length or longer contains at least 90% of the sum of the lengths of all contigs.", + "type": "number" + }, + "ctg_logsum": { + "description": "Maximum contig length.", + "type": "number" + }, + "ctg_max": { + "description": "Maximum contig length.", + "type": "number" + }, + "ctg_n50": { + "description": "Given a set of contigs, each with its own length, the N50 count is defined as the smallest number_of_contigs whose length sum makes up half of genome size.", + "type": "number" + }, + "ctg_n90": { + "description": "Given a set of contigs, each with its own length, the N90 count is defined as the smallest number of contigs whose length sum makes up 90% of genome size.", + "type": "number" + }, + "ctg_powsum": { + "description": "Powersum of all contigs is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", + "type": "number" + }, "ended_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6506,17 +6582,21 @@ "execution_resource": { "type": "string" }, + "gap_pct": { + "description": "The gap size percentage of all scaffolds.", + "type": "number" + }, + "gc_avg": { + "description": "Average of GC content of all contigs.", + "type": "number" + }, + "gc_std": { + "description": "Standard deviation of GC 
content of all contigs.", + "type": "number" + }, "git_url": { "type": "string" }, - "gold_analysis_project_identifiers": { - "description": "identifiers for corresponding analysis project in GOLD", - "items": { - "pattern": "^gold:Ga[0-9]+$", - "type": "string" - }, - "type": "array" - }, "has_failure_categorization": { "items": { "$ref": "#/$defs/FailureCategorization" @@ -6542,10 +6622,22 @@ "pattern": "^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$", "type": "string" }, + "insdc_assembly_identifiers": { + "pattern": "^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$", + "type": "string" + }, "name": { "description": "A human readable label for an entity", "type": "string" }, + "num_aligned_reads": { + "description": "The sequence count number of input reads aligned to assembled contigs.", + "type": "number" + }, + "num_input_reads": { + "description": "The sequence count number of input reads for assembly.", + "type": "number" + }, "part_of": { "description": "Links a resource to another resource that either logically or physically includes it.", "items": { @@ -6561,6 +6653,54 @@ "$ref": "#/$defs/StatusEnum", "description": "Stores information about the result of a process (ie the process of sequencing a library may have for qc_status of 'fail' if not enough data was generated)" }, + "scaf_bp": { + "description": "Total size in bp of all scaffolds.", + "type": "number" + }, + "scaf_l50": { + "description": "Given a set of scaffolds, the L50 is defined as the sequence length of the shortest scaffold at 50% of the total genome length.", + "type": "number" + }, + "scaf_l90": { + "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all scaffolds of that length or longer contains at least 90% of the sum of the lengths of all scaffolds.", + "type": "number" + }, + "scaf_l_gt50k": { + "description": "Total size in bp of all scaffolds greater than 50 KB.", + "type": "number" + }, + 
"scaf_logsum": { + "description": "The sum of the (length*log(length)) of all scaffolds, times some constant. Increase the contiguity, the score will increase", + "type": "number" + }, + "scaf_max": { + "description": "Maximum scaffold length.", + "type": "number" + }, + "scaf_n50": { + "description": "Given a set of scaffolds, each with its own length, the N50 count is defined as the smallest number of scaffolds whose length sum makes up half of genome size.", + "type": "number" + }, + "scaf_n90": { + "description": "Given a set of scaffolds, each with its own length, the N90 count is defined as the smallest number of scaffolds whose length sum makes up 90% of genome size.", + "type": "number" + }, + "scaf_n_gt50k": { + "description": "Total sequence count of scaffolds greater than 50 KB.", + "type": "number" + }, + "scaf_pct_gt50k": { + "description": "Total sequence size percentage of scaffolds greater than 50 KB.", + "type": "number" + }, + "scaf_powsum": { + "description": "Powersum of all scaffolds is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", + "type": "number" + }, + "scaffolds": { + "description": "Total sequence count of all scaffolds.", + "type": "number" + }, "started_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6580,18 +6720,18 @@ } }, "required": [ - "type", "execution_resource", "git_url", "has_input", + "type", "id", "started_at_time", "ended_at_time" ], - "title": "MetatranscriptomeAnnotationActivity", + "title": "MetatranscriptomeAssembly", "type": "object" }, - "MetatranscriptomeAssembly": { + "MetatranscriptomeExpressionAnalysis": { "additionalProperties": false, "allOf": [ { @@ 
-6635,7 +6775,7 @@ } } ], - "description": "", + "description": "A workflow process that provides expression values and read counts for gene features predicted on the contigs.", "properties": { "alternative_identifiers": { "description": "A list of alternative identifiers for the entity.", @@ -6645,46 +6785,6 @@ }, "type": "array" }, - "asm_score": { - "description": "A score for comparing metagenomic assembly quality from same sample.", - "type": "number" - }, - "contig_bp": { - "description": "Total size in bp of all contigs.", - "type": "number" - }, - "contigs": { - "description": "The sum of the (length*log(length)) of all contigs, times some constant. Increase the contiguity, the score will increase", - "type": "number" - }, - "ctg_l50": { - "description": "Given a set of contigs, the L50 is defined as the sequence length of the shortest contig at 50% of the total genome length.", - "type": "number" - }, - "ctg_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all contigs of that length or longer contains at least 90% of the sum of the lengths of all contigs.", - "type": "number" - }, - "ctg_logsum": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_max": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_n50": { - "description": "Given a set of contigs, each with its own length, the N50 count is defined as the smallest number_of_contigs whose length sum makes up half of genome size.", - "type": "number" - }, - "ctg_n90": { - "description": "Given a set of contigs, each with its own length, the N90 count is defined as the smallest number of contigs whose length sum makes up 90% of genome size.", - "type": "number" - }, - "ctg_powsum": { - "description": "Powersum of all contigs is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, "ended_at_time": { 
"pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6692,18 +6792,6 @@ "execution_resource": { "type": "string" }, - "gap_pct": { - "description": "The gap size percentage of all scaffolds.", - "type": "number" - }, - "gc_avg": { - "description": "Average of GC content of all contigs.", - "type": "number" - }, - "gc_std": { - "description": "Standard deviation of GC content of all contigs.", - "type": "number" - }, "git_url": { "type": "string" }, @@ -6732,22 +6820,10 @@ "pattern": "^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$", "type": "string" }, - "insdc_assembly_identifiers": { - "pattern": "^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$", - "type": "string" - }, "name": { "description": "A human readable label for an entity", "type": "string" }, - "num_aligned_reads": { - "description": "The sequence count number of input reads aligned to assembled contigs.", - "type": "number" - }, - "num_input_reads": { - "description": "The sequence count number of input reads for assembly.", - "type": "number" - }, "part_of": { "description": "Links a resource to another resource that either logically or physically includes it.", "items": { @@ -6763,54 +6839,6 @@ "$ref": "#/$defs/StatusEnum", "description": "Stores information about the result of a process (ie the process of sequencing a library may have for qc_status of 'fail' if not enough data was generated)" }, - "scaf_bp": { - "description": "Total size in bp of all scaffolds.", - "type": "number" - }, - "scaf_l50": { - "description": "Given a set of scaffolds, the L50 is defined as the sequence length of the shortest scaffold at 50% of the total genome length.", - "type": "number" - }, - "scaf_l90": { - "description": 
"The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all scaffolds of that length or longer contains at least 90% of the sum of the lengths of all scaffolds.", - "type": "number" - }, - "scaf_l_gt50k": { - "description": "Total size in bp of all scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_logsum": { - "description": "The sum of the (length*log(length)) of all scaffolds, times some constant. Increase the contiguity, the score will increase", - "type": "number" - }, - "scaf_max": { - "description": "Maximum scaffold length.", - "type": "number" - }, - "scaf_n50": { - "description": "Given a set of scaffolds, each with its own length, the N50 count is defined as the smallest number of scaffolds whose length sum makes up half of genome size.", - "type": "number" - }, - "scaf_n90": { - "description": "Given a set of scaffolds, each with its own length, the N90 count is defined as the smallest number of scaffolds whose length sum makes up 90% of genome size.", - "type": "number" - }, - "scaf_n_gt50k": { - "description": "Total sequence count of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_pct_gt50k": { - "description": "Total sequence size percentage of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_powsum": { - "description": "Powersum of all scaffolds is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, - "scaffolds": { - "description": "Total sequence count of all scaffolds.", - "type": "number" - }, "started_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6830,15 +6858,15 @@ } 
}, "required": [ + "type", "execution_resource", "git_url", "has_input", - "type", "id", "started_at_time", "ended_at_time" ], - "title": "MetatranscriptomeAssembly", + "title": "MetatranscriptomeExpressionAnalysis", "type": "object" }, "MixingProcess": { @@ -8617,6 +8645,15 @@ "title": "StatusEnum", "type": "string" }, + "StrandedOrientationEnum": { + "description": "This enumeration specifies information about stranded RNA library preparations.", + "enum": [ + "antisense orientation", + "sense orientation" + ], + "title": "StrandedOrientationEnum", + "type": "string" + }, "Study": { "additionalProperties": false, "description": "A study summarizes the overall goal of a research initiative and outlines the key objective of its underlying projects.", @@ -9411,10 +9448,24 @@ }, "type": "array" }, - "metatranscriptome_activity_set": { - "description": "This property links a database object to the set of metatranscriptome analysis activities.", + "metatranscriptome_annotation_set": { + "description": "This property links a database object to the set of metatranscriptome annotations within it.", + "items": { + "$ref": "#/$defs/MetatranscriptomeAnnotationActivity" + }, + "type": "array" + }, + "metatranscriptome_assembly_set": { + "description": "This property links a database object to the set of metatranscriptome assemblies within it.", + "items": { + "$ref": "#/$defs/MetatranscriptomeAssembly" + }, + "type": "array" + }, + "metatranscriptome_expression_analysis_set": { + "description": "This property links a database object to the set of metatranscriptome expression analysis activities.", "items": { - "$ref": "#/$defs/MetatranscriptomeActivity" + "$ref": "#/$defs/MetatranscriptomeExpressionAnalysis" }, "type": "array" }, diff --git a/nmdc_schema/nmdc_materialized_patterns.schema.json b/nmdc_schema/nmdc_materialized_patterns.schema.json index 97983ed69e..d73aae9886 100644 --- a/nmdc_schema/nmdc_materialized_patterns.schema.json +++ 
b/nmdc_schema/nmdc_materialized_patterns.schema.json @@ -3597,7 +3597,7 @@ "type": "string" }, "was_generated_by": { - "pattern": "^^(nmdc):(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$|^^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})$", + "pattern": "^(nmdc):(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$|^^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})$", "type": "string" } }, @@ -3715,10 +3715,24 @@ }, "type": "array" }, - "metatranscriptome_activity_set": { - "description": "This property links a database object to the set of metatranscriptome analysis activities.", + "metatranscriptome_annotation_set": { + "description": "This property links a database object to the set of metatranscriptome annotations within it.", "items": { - "$ref": "#/$defs/MetatranscriptomeActivity" + "$ref": "#/$defs/MetatranscriptomeAnnotationActivity" + }, + "type": "array" + }, + "metatranscriptome_assembly_set": { + "description": "This property links a database object to the set of metatranscriptome assemblies within it.", + "items": { + "$ref": "#/$defs/MetatranscriptomeAssembly" + }, + "type": "array" + }, + "metatranscriptome_expression_analysis_set": { + "description": "This property links a database object to the set of metatranscriptome expression analysis activities.", + "items": { + "$ref": "#/$defs/MetatranscriptomeExpressionAnalysis" }, "type": "array" }, @@ -4290,7 +4304,9 @@ "Extraction", "LibraryPreparation", "MetagenomeAssembly", - "MetatranscriptomeActivity", + "MetatranscriptomeExpressionAnalysis", + "MetatranscriptomeAnnotation", + "MetatranscriptomeAssembly", "MagsAnalysisActivity", "ReadQcAnalysisActivity", "ReadBasedTaxonomyAnalysisActivity", @@ -4474,7 +4490,11 @@ "Pfam Annotation GFF", "Annotation Statistics", "Direct Infusion FT ICR-MS Raw Data", - "LC-DDA-MS/MS Raw Data" + 
"LC-DDA-MS/MS Raw Data", + "Metatranscriptome Expression", + "Metatranscriptome Expression Intergenic", + "Metatranscriptome Expression Info File", + "rRNA Filtered Sequencing Reads" ], "title": "FileTypeEnum", "type": "string" @@ -4735,7 +4755,7 @@ }, "metagenome_annotation_id": { "description": "The identifier for the analysis activity (WorkflowExecutionActivity) that generated the functional annotation results.", - "pattern": "^(nmdc):(wfmgan)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", + "pattern": "^(nmdc):(wfmgan|wfmtan)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", "type": "string" } }, @@ -5149,6 +5169,10 @@ "description": "The name of the instrument that was used for processing the sample.", "type": "string" }, + "is_stranded": { + "description": "Is the (RNA) library stranded or non-stranded (unstranded).", + "type": "boolean" + }, "library_preparation_kit": { "type": "string" }, @@ -5180,6 +5204,10 @@ "start_date": { "description": "The date on which any process or activity was started", "type": "string" + }, + "stranded_orientation": { + "$ref": "#/$defs/StrandedOrientationEnum", + "description": "Lists the strand orientiation for a stranded RNA library preparation." } }, "required": [ @@ -6343,7 +6371,7 @@ "title": "MetaproteomicsAnalysisActivity", "type": "object" }, - "MetatranscriptomeActivity": { + "MetatranscriptomeAnnotationActivity": { "additionalProperties": false, "allOf": [ { @@ -6387,7 +6415,7 @@ } } ], - "description": "A metatranscriptome activity that e.g. 
pools assembly and annotation activity.", + "description": "", "properties": { "alternative_identifiers": { "description": "A list of alternative identifiers for the entity.", @@ -6407,6 +6435,14 @@ "git_url": { "type": "string" }, + "gold_analysis_project_identifiers": { + "description": "identifiers for corresponding analysis project in GOLD", + "items": { + "pattern": "^gold:Ga[0-9]+$", + "type": "string" + }, + "type": "array" + }, "has_failure_categorization": { "items": { "$ref": "#/$defs/FailureCategorization" @@ -6431,7 +6467,7 @@ }, "id": { "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmt-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", + "pattern": "^(nmdc):wfmtan-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", "type": "string" }, "name": { @@ -6481,10 +6517,10 @@ "started_at_time", "ended_at_time" ], - "title": "MetatranscriptomeActivity", + "title": "MetatranscriptomeAnnotationActivity", "type": "object" }, - "MetatranscriptomeAnnotationActivity": { + "MetatranscriptomeAssembly": { "additionalProperties": false, "allOf": [ { @@ -6538,6 +6574,46 @@ }, "type": "array" }, + "asm_score": { + "description": "A score for comparing metagenomic assembly quality from same sample.", + "type": "number" + }, + "contig_bp": { + "description": "Total size in bp of all contigs.", + "type": "number" + }, + "contigs": { + "description": "The sum of the (length*log(length)) of all contigs, times some constant. 
Increase the contiguity, the score will increase", + "type": "number" + }, + "ctg_l50": { + "description": "Given a set of contigs, the L50 is defined as the sequence length of the shortest contig at 50% of the total genome length.", + "type": "number" + }, + "ctg_l90": { + "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all contigs of that length or longer contains at least 90% of the sum of the lengths of all contigs.", + "type": "number" + }, + "ctg_logsum": { + "description": "Maximum contig length.", + "type": "number" + }, + "ctg_max": { + "description": "Maximum contig length.", + "type": "number" + }, + "ctg_n50": { + "description": "Given a set of contigs, each with its own length, the N50 count is defined as the smallest number_of_contigs whose length sum makes up half of genome size.", + "type": "number" + }, + "ctg_n90": { + "description": "Given a set of contigs, each with its own length, the N90 count is defined as the smallest number of contigs whose length sum makes up 90% of genome size.", + "type": "number" + }, + "ctg_powsum": { + "description": "Powersum of all contigs is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", + "type": "number" + }, "ended_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6545,17 +6621,21 @@ "execution_resource": { "type": "string" }, + "gap_pct": { + "description": "The gap size percentage of all scaffolds.", + "type": "number" + }, + "gc_avg": { + "description": "Average of GC content of all contigs.", + "type": "number" + }, + "gc_std": { + "description": "Standard deviation of GC 
content of all contigs.", + "type": "number" + }, "git_url": { "type": "string" }, - "gold_analysis_project_identifiers": { - "description": "identifiers for corresponding analysis project in GOLD", - "items": { - "pattern": "^gold:Ga[0-9]+$", - "type": "string" - }, - "type": "array" - }, "has_failure_categorization": { "items": { "$ref": "#/$defs/FailureCategorization" @@ -6580,13 +6660,25 @@ }, "id": { "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmtan-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", + "pattern": "^(nmdc):wfmtas-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", + "type": "string" + }, + "insdc_assembly_identifiers": { + "pattern": "^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$", "type": "string" }, "name": { "description": "A human readable label for an entity", "type": "string" }, + "num_aligned_reads": { + "description": "The sequence count number of input reads aligned to assembled contigs.", + "type": "number" + }, + "num_input_reads": { + "description": "The sequence count number of input reads for assembly.", + "type": "number" + }, "part_of": { "description": "Links a resource to another resource that either logically or physically includes it.", "items": { @@ -6602,6 +6694,54 @@ "$ref": "#/$defs/StatusEnum", "description": "Stores information about the result of a process (ie the process of sequencing a library may have for qc_status of 'fail' if not enough data was generated)" }, + "scaf_bp": { + "description": "Total size in bp of all scaffolds.", + "type": "number" + }, + "scaf_l50": { + "description": "Given a set of scaffolds, the L50 is defined as the sequence length of the shortest scaffold at 50% of the total genome length.", + "type": "number" + }, + "scaf_l90": { + "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all scaffolds of that length or longer 
contains at least 90% of the sum of the lengths of all scaffolds.", + "type": "number" + }, + "scaf_l_gt50k": { + "description": "Total size in bp of all scaffolds greater than 50 KB.", + "type": "number" + }, + "scaf_logsum": { + "description": "The sum of the (length*log(length)) of all scaffolds, times some constant. Increase the contiguity, the score will increase", + "type": "number" + }, + "scaf_max": { + "description": "Maximum scaffold length.", + "type": "number" + }, + "scaf_n50": { + "description": "Given a set of scaffolds, each with its own length, the N50 count is defined as the smallest number of scaffolds whose length sum makes up half of genome size.", + "type": "number" + }, + "scaf_n90": { + "description": "Given a set of scaffolds, each with its own length, the N90 count is defined as the smallest number of scaffolds whose length sum makes up 90% of genome size.", + "type": "number" + }, + "scaf_n_gt50k": { + "description": "Total sequence count of scaffolds greater than 50 KB.", + "type": "number" + }, + "scaf_pct_gt50k": { + "description": "Total sequence size percentage of scaffolds greater than 50 KB.", + "type": "number" + }, + "scaf_powsum": { + "description": "Powersum of all scaffolds is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", + "type": "number" + }, + "scaffolds": { + "description": "Total sequence count of all scaffolds.", + "type": "number" + }, "started_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6622,18 +6762,18 @@ } }, "required": [ - "type", "execution_resource", "git_url", "has_input", + "type", "id", "started_at_time", "ended_at_time" ], - "title": 
"MetatranscriptomeAnnotationActivity", + "title": "MetatranscriptomeAssembly", "type": "object" }, - "MetatranscriptomeAssembly": { + "MetatranscriptomeExpressionAnalysis": { "additionalProperties": false, "allOf": [ { @@ -6677,7 +6817,7 @@ } } ], - "description": "", + "description": "A workflow process that provides expression values and read counts for gene features predicted on the contigs.", "properties": { "alternative_identifiers": { "description": "A list of alternative identifiers for the entity.", @@ -6687,46 +6827,6 @@ }, "type": "array" }, - "asm_score": { - "description": "A score for comparing metagenomic assembly quality from same sample.", - "type": "number" - }, - "contig_bp": { - "description": "Total size in bp of all contigs.", - "type": "number" - }, - "contigs": { - "description": "The sum of the (length*log(length)) of all contigs, times some constant. Increase the contiguity, the score will increase", - "type": "number" - }, - "ctg_l50": { - "description": "Given a set of contigs, the L50 is defined as the sequence length of the shortest contig at 50% of the total genome length.", - "type": "number" - }, - "ctg_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all contigs of that length or longer contains at least 90% of the sum of the lengths of all contigs.", - "type": "number" - }, - "ctg_logsum": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_max": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_n50": { - "description": "Given a set of contigs, each with its own length, the N50 count is defined as the smallest number_of_contigs whose length sum makes up half of genome size.", - "type": "number" - }, - "ctg_n90": { - "description": "Given a set of contigs, each with its own length, the N90 count is defined as the smallest number of contigs whose length sum makes up 90% of genome size.", - "type": 
"number" - }, - "ctg_powsum": { - "description": "Powersum of all contigs is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, "ended_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6734,18 +6834,6 @@ "execution_resource": { "type": "string" }, - "gap_pct": { - "description": "The gap size percentage of all scaffolds.", - "type": "number" - }, - "gc_avg": { - "description": "Average of GC content of all contigs.", - "type": "number" - }, - "gc_std": { - "description": "Standard deviation of GC content of all contigs.", - "type": "number" - }, "git_url": { "type": "string" }, @@ -6773,25 +6861,13 @@ }, "id": { "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmtas-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", - "type": "string" - }, - "insdc_assembly_identifiers": { - "pattern": "^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$", + "pattern": "^(nmdc):wfmtex-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[0-9]{1,})$", "type": "string" }, "name": { "description": "A human readable label for an entity", "type": "string" }, - "num_aligned_reads": { - "description": "The sequence count number of input reads aligned to assembled contigs.", - "type": "number" - }, - "num_input_reads": { - "description": "The sequence count number of input reads for assembly.", - "type": "number" - }, "part_of": { "description": "Links a resource to another resource that either logically or physically includes it.", "items": { @@ -6807,54 +6883,6 @@ "$ref": "#/$defs/StatusEnum", "description": "Stores information about the result of a process (ie the process of sequencing a library may have for qc_status of 'fail' if not enough data was generated)" }, - "scaf_bp": { - "description": "Total size in bp of all scaffolds.", - "type": "number" - }, - "scaf_l50": { - "description": "Given a set of scaffolds, the L50 is defined as the sequence length of the shortest scaffold at 50% of the total genome length.", - "type": "number" - }, - "scaf_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all scaffolds of that length or longer contains at least 90% of the sum of the lengths of all scaffolds.", - "type": "number" - }, - "scaf_l_gt50k": { - "description": "Total size in bp of all scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_logsum": { - "description": "The sum of the (length*log(length)) of all scaffolds, times some constant. 
Increase the contiguity, the score will increase", - "type": "number" - }, - "scaf_max": { - "description": "Maximum scaffold length.", - "type": "number" - }, - "scaf_n50": { - "description": "Given a set of scaffolds, each with its own length, the N50 count is defined as the smallest number of scaffolds whose length sum makes up half of genome size.", - "type": "number" - }, - "scaf_n90": { - "description": "Given a set of scaffolds, each with its own length, the N90 count is defined as the smallest number of scaffolds whose length sum makes up 90% of genome size.", - "type": "number" - }, - "scaf_n_gt50k": { - "description": "Total sequence count of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_pct_gt50k": { - "description": "Total sequence size percentage of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_powsum": { - "description": "Powersum of all scaffolds is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, - "scaffolds": { - "description": "Total sequence count of all scaffolds.", - "type": "number" - }, "started_at_time": { "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", "type": "string" @@ -6875,15 +6903,15 @@ } }, "required": [ + "type", "execution_resource", "git_url", "has_input", - "type", "id", "started_at_time", "ended_at_time" ], - "title": "MetatranscriptomeAssembly", + "title": "MetatranscriptomeExpressionAnalysis", "type": "object" }, "MixingProcess": { @@ -8678,6 +8706,15 @@ "title": "StatusEnum", "type": "string" }, + "StrandedOrientationEnum": { + "description": "This enumeration specifies information about stranded RNA library preparations.", + "enum": [ + 
"antisense orientation", + "sense orientation" + ], + "title": "StrandedOrientationEnum", + "type": "string" + }, "Study": { "additionalProperties": false, "description": "A study summarizes the overall goal of a research initiative and outlines the key objective of its underlying projects.", @@ -9475,10 +9512,24 @@ }, "type": "array" }, - "metatranscriptome_activity_set": { - "description": "This property links a database object to the set of metatranscriptome analysis activities.", + "metatranscriptome_annotation_set": { + "description": "This property links a database object to the set of metatranscriptome annotations within it.", + "items": { + "$ref": "#/$defs/MetatranscriptomeAnnotationActivity" + }, + "type": "array" + }, + "metatranscriptome_assembly_set": { + "description": "This property links a database object to the set of metatranscriptome assemblies within it.", + "items": { + "$ref": "#/$defs/MetatranscriptomeAssembly" + }, + "type": "array" + }, + "metatranscriptome_expression_analysis_set": { + "description": "This property links a database object to the set of metatranscriptome expression analysis activities.", "items": { - "$ref": "#/$defs/MetatranscriptomeActivity" + "$ref": "#/$defs/MetatranscriptomeExpressionAnalysis" }, "type": "array" }, diff --git a/nmdc_schema/nmdc_materialized_patterns.yaml b/nmdc_schema/nmdc_materialized_patterns.yaml index c4e691553c..f0908af7a4 100644 --- a/nmdc_schema/nmdc_materialized_patterns.yaml +++ b/nmdc_schema/nmdc_materialized_patterns.yaml @@ -603,6 +603,24 @@ types: uri: xsd:anyURI pattern: ^[a-zA-Z0-9][a-zA-Z0-9_\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\-\/\.,]*$ enums: + StrandedOrientationEnum: + name: StrandedOrientationEnum + description: This enumeration specifies information about stranded RNA library + preparations. 
+ from_schema: https://w3id.org/nmdc/nmdc + permissible_values: + antisense orientation: + text: antisense orientation + description: Orientation that is complementary (non-coding) to a sequence + of messenger RNA. + comments: + - See https://www.genome.gov/genetics-glossary/antisense + exact_mappings: + - SO:0000077 + sense orientation: + text: sense orientation + description: Orientation that corresponds to the coding sequence of messenger + RNA. InstrumentModelEnum: name: InstrumentModelEnum from_schema: https://w3id.org/nmdc/nmdc @@ -1125,6 +1143,36 @@ enums: text: LC-DDA-MS/MS Raw Data description: Liquid chromatographically separated MS1 and Data-Dependent MS2 binary instrument file + Metatranscriptome Expression: + text: Metatranscriptome Expression + description: Metatranscriptome expression values and read counts for gene + features predicted on contigs + annotations: + file_name_pattern: + tag: file_name_pattern + value: '*.rnaseq_gea.txt' + Metatranscriptome Expression Intergenic: + text: Metatranscriptome Expression Intergenic + description: Metatranscriptome expression values and read counts for intergenic + regions. + annotations: + file_name_pattern: + tag: file_name_pattern + value: '*.rnaseq_gea.intergenic.txt' + Metatranscriptome Expression Info File: + text: Metatranscriptome Expression Info File + description: File containing version information on the expression workflow + annotations: + file_name_pattern: + tag: file_name_pattern + value: '*_readcount.info' + rRNA Filtered Sequencing Reads: + text: rRNA Filtered Sequencing Reads + description: File containing ribosomal reads from the read qc filtering step. + annotations: + file_name_pattern: + tag: file_name_pattern + value: '*.rRNA.fastq.gz' CreditEnum: name: CreditEnum comments: @@ -1356,10 +1404,18 @@ enums: MetagenomeAssembly: text: MetagenomeAssembly description: A failure has occurred in metagenome assembly, a workflow process. 
- MetatranscriptomeActivity: - text: MetatranscriptomeActivity - description: A failure has occurred in metatranscriptome analysis, a workflow - process. + MetatranscriptomeExpressionAnalysis: + text: MetatranscriptomeExpressionAnalysis + description: A failure has occurred in metatranscriptome expression analysis, + a workflow process. + MetatranscriptomeAnnotation: + text: MetatranscriptomeAnnotation + description: A failure has occurred in metatranscriptome annotation analysis, + a workflow process. + MetatranscriptomeAssembly: + text: MetatranscriptomeAssembly + description: A failure has occurred in metatranscriptome assembly analysis, + a workflow process. MagsAnalysisActivity: text: MagsAnalysisActivity description: A failure has occurred in binning, a workflow process to generate @@ -3667,6 +3723,9 @@ slots: domain: FunctionalAnnotationAggMember range: MetagenomeAnnotationActivity required: true + any_of: + - range: MetagenomeAnnotationActivity + - range: MetatranscriptomeAnnotationActivity gene_function_id: name: gene_function_id description: The identifier for the gene function. @@ -3746,6 +3805,18 @@ slots: exact_mappings: - OBI:0002475 range: integer + is_stranded: + name: is_stranded + description: Is the (RNA) library stranded or non-stranded (unstranded). + comments: + - A value of true means the library is stranded, false means non-stranded. + from_schema: https://w3id.org/nmdc/nmdc + range: boolean + stranded_orientation: + name: stranded_orientation + description: Lists the strand orientation for a stranded RNA library preparation. 
+ from_schema: https://w3id.org/nmdc/nmdc + range: StrandedOrientationEnum mass: name: mass description: A physical quality that inheres in a bearer by virtue of the proportion @@ -4211,15 +4282,15 @@ slots: - object_set domain: Database range: MetagenomeSequencingActivity - metatranscriptome_activity_set: - name: metatranscriptome_activity_set + metatranscriptome_expression_analysis_set: + name: metatranscriptome_expression_analysis_set description: This property links a database object to the set of metatranscriptome - analysis activities. + expression analysis activities. from_schema: https://w3id.org/nmdc/nmdc mixins: - object_set domain: Database - range: MetatranscriptomeActivity + range: MetatranscriptomeExpressionAnalysis read_qc_analysis_activity_set: name: read_qc_analysis_activity_set description: This property links a database object to the set of read QC analysis @@ -4292,6 +4363,24 @@ slots: - object_set domain: Database range: LibraryPreparation + metatranscriptome_assembly_set: + name: metatranscriptome_assembly_set + description: This property links a database object to the set of metatranscriptome + assemblies within it. + from_schema: https://w3id.org/nmdc/nmdc + mixins: + - object_set + domain: Database + range: MetatranscriptomeAssembly + metatranscriptome_annotation_set: + name: metatranscriptome_annotation_set + description: This property links a database object to the set of metatranscriptome + annotations within it. + from_schema: https://w3id.org/nmdc/nmdc + mixins: + - object_set + domain: Database + range: MetatranscriptomeAnnotationActivity omics_type: name: omics_type description: The type of omics data @@ -4320,6 +4409,15 @@ slots: range: string gold_path_field: name: gold_path_field + annotations: + tooltip: + tag: tooltip + value: GOLD Ecosystem Classification paths describe the surroundings from + which an environmental sample or an organism is collected. 
+ annotations: + source: + tag: source + value: https://gold.jgi.doe.gov/ecosystem_classification description: This is a grouping for any of the gold path fields from_schema: https://w3id.org/nmdc/nmdc abstract: true @@ -17826,9 +17924,9 @@ slots: mappings: - prov:wasGeneratedBy range: Activity - pattern: ^^(nmdc):(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$|^^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})$ + pattern: ^(nmdc):(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$|^^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})$ structured_pattern: - syntax: ^{id_nmdc_prefix}:(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-{id_shoulder}-{id_blade}{id_version}$|^{id_nmdc_prefix}:omprc-{id_shoulder}-{id_blade}$ + syntax: '{id_nmdc_prefix}:(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-{id_shoulder}-{id_blade}{id_version}$|^{id_nmdc_prefix}:omprc-{id_shoulder}-{id_blade}$' interpolated: true any_of: - range: Activity @@ -18126,9 +18224,9 @@ classes: slot_usage: metagenome_annotation_id: name: metagenome_annotation_id - pattern: ^(nmdc):(wfmgan)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$ + pattern: ^(nmdc):(wfmgan|wfmtan)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$ structured_pattern: - syntax: '{id_nmdc_prefix}:(wfmgan)-{id_shoulder}-{id_blade}{id_version}$' + syntax: '{id_nmdc_prefix}:(wfmgan|wfmtan)-{id_shoulder}-{id_blade}{id_version}$' interpolated: true Database: name: Database @@ -18156,7 +18254,9 @@ classes: - metagenome_assembly_set - metagenome_sequencing_activity_set - metaproteomics_analysis_activity_set - - metatranscriptome_activity_set + - metatranscriptome_annotation_set + - metatranscriptome_assembly_set + - metatranscriptome_expression_analysis_set - nom_analysis_activity_set - omics_processing_set - 
planned_process_set @@ -18290,6 +18390,8 @@ classes: - library_preparation_kit - library_type - pcr_cycles + - is_stranded + - stranded_orientation slot_usage: has_input: name: has_input @@ -18418,9 +18520,9 @@ classes: interpolated: true was_generated_by: name: was_generated_by - pattern: ^^(nmdc):(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$|^^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})$ + pattern: ^(nmdc):(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$|^^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})$ structured_pattern: - syntax: ^{id_nmdc_prefix}:(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-{id_shoulder}-{id_blade}{id_version}$|^{id_nmdc_prefix}:omprc-{id_shoulder}-{id_blade}$ + syntax: '{id_nmdc_prefix}:(wfmag|wfmb|wfmgan|wfmgas|wfmsa|wfmp|wfmt|wfmtan|wfmtas|wfnom|wfrbt|wfrqc)-{id_shoulder}-{id_blade}{id_version}$|^{id_nmdc_prefix}:omprc-{id_shoulder}-{id_blade}$' interpolated: true Biosample: name: Biosample @@ -20740,10 +20842,10 @@ classes: structured_pattern: syntax: '{id_nmdc_prefix}:wfmtan-{id_shoulder}-{id_blade}{id_version}$' interpolated: true - MetatranscriptomeActivity: - name: MetatranscriptomeActivity - description: A metatranscriptome activity that e.g. pools assembly and annotation - activity. + MetatranscriptomeExpressionAnalysis: + name: MetatranscriptomeExpressionAnalysis + description: A workflow process that provides expression values and read counts + for gene features predicted on the contigs. 
in_subset: - workflow subset from_schema: https://w3id.org/nmdc/nmdc @@ -20754,9 +20856,9 @@ classes: id: name: id required: true - pattern: ^(nmdc):wfmt-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$ + pattern: ^(nmdc):wfmtex-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$ structured_pattern: - syntax: '{id_nmdc_prefix}:wfmt-{id_shoulder}-{id_blade}{id_version}$' + syntax: '{id_nmdc_prefix}:wfmtex-{id_shoulder}-{id_blade}{id_version}$' interpolated: true MagsAnalysisActivity: name: MagsAnalysisActivity