src/main/resources/avro/metadata.avdl

@namespace("org.ga4gh.models")

/**
This protocol defines metadata used in the other GA4GH protocols.
*/

protocol Metadata {

import idl "common.avdl";

/**
An experimental preparation of a `Sample`.

`id`, `recordCreateTime` and `recordUpdateTime` are required. Every other
field is optional: nullable fields default to `null`, and `info` defaults to
an empty map.
*/
record Experiment {
  /** The experiment UUID. This is globally unique. */
  string id;

  /** The name of the experiment. */
  union { null, string } name = null;

  /** A description of the experiment. */
  union { null, string } description = null;

  /**
  The time at which this record was created.
  Format: ISO 8601, YYYY-MM-DDTHH:MM:SS.SSS (e.g. 2015-02-10T00:03:42.123Z)
  */
  string recordCreateTime;

  /**
  The time at which this record was last updated.
  Format: ISO 8601, YYYY-MM-DDTHH:MM:SS.SSS (e.g. 2015-02-10T00:03:42.123Z)
  */
  string recordUpdateTime;

  /**
  The time at which this experiment was performed.
  Granularity here is variable (e.g. date only).
  Format: ISO 8601, YYYY-MM-DDTHH:MM:SS (e.g. 2015-02-10T00:03:42)
  */
  union { null, string } runTime = null;

  /**
  The molecule examined in this experiment. (e.g. genomic DNA, total RNA)
  */
  union { null, string } molecule = null;

  /**
  The experiment technique or strategy applied to the sample.
  (e.g. whole genome sequencing, RNA-seq, RIP-seq)
  */
  union { null, string } strategy = null;

  /**
  The method used to enrich the target. (e.g. immunoprecipitation, size
  fractionation, MNase digestion)
  */
  union { null, string } selection = null;

  /** The name of the library used as part of this experiment. */
  union { null, string } library = null;

  /** The configuration of sequenced reads. (e.g. Single or Paired) */
  union { null, string } libraryLayout = null;

  /**
  The instrument model used as part of this experiment.
  This maps to sequencing technology in BAM.
  Optional; defaults to null like the other nullable fields of this record.
  */
  union { null, string } instrumentModel = null;

  /**
  The data file generated by the instrument.
  TODO: This isn't actually a file is it?
  Should this be `instrumentData` instead?
  */
  union { null, string } instrumentDataFile = null;

  /**
  The sequencing center used as part of this experiment.
  Optional; defaults to null like the other nullable fields of this record.
  */
  union { null, string } sequencingCenter = null;

  /**
  The platform unit used as part of this experiment. This is a flowcell-barcode
  or slide unique identifier.
  */
  union { null, string } platformUnit = null;

  /**
  A map of additional experiment information. Each key maps to a list of
  string values; the map is empty by default.
  */
  map<array<string>> info = {};
}

/**
NOTE: there's ongoing discussion about changing the role of Dataset, possibly as follows:

Represents a group of contextually related data objects (e.g. all Individuals, Samples and
Experiments associated with a particular feature, or a trio in genetic diagnostics).
This concept may be expanded in the future (ontology for describing the type of dataset ...).
TODO: Determination of scope, structure, specific attributes, e.g. limiting to single 
record type - see http://purl.obolibrary.org/obo/IAO_0000100 - and providing alternative mechanism 
for heterogeneous data with external contextualization, e.g. all records of different 
types associated with a clinical study.
*/

/**
A Dataset is a data-provider-specified collection of related data of multiple types.
Logically, it's akin to a folder -- it's up to the provider what goes into the folder.

For server implementors, datasets are a useful level of granularity for implementing
administrative features such as access control (e.g. Dataset X is public;
Dataset Y is only available to lab Z's collaborators) and billing (e.g. the costs
of hosting Dataset Y should be charged to lab Z).

For data curators, datasets are 'the simplest thing that could possibly work' for
grouping data (e.g. Dataset X has all the reads, variants, and expression levels for
a particular research project; Dataset Y has all the work product from a particular
grant).

For data accessors, datasets are a simple way to scope exploration and analysis
(e.g. are there any supporting examples in 1000genomes?
what's the distribution of that result in the data from our project?)

Only `id` is required; `name` and `description` are nullable and default to null.
*/
record Dataset {
  /**
  The dataset's id, (at least) locally unique.
  Required: this field has no default value.
  */
  string id;

  /**
  The name of the dataset.
  Optional; defaults to null.
  */
  union { null, string } name = null;

  /**
  Additional, human-readable information on the dataset.
  Optional; defaults to null.
  */
  union { null, string } description = null;

}

}