Merge pull request #322 from laserkelvin/hdf5-datasets
Data schema and HDF5 data support
laserkelvin authored Dec 13, 2024
2 parents 4c69638 + 00d2f63 commit dc6a125
Showing 12 changed files with 2,137 additions and 46 deletions.
29 changes: 29 additions & 0 deletions Dockerfile.xpu
@@ -0,0 +1,29 @@
FROM amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly-xpu
# for setup run with root
USER 0
# needed to make sure mamba environment is activated
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# install additional packages on top of the XPU torch provided by the base image
RUN pip install torch-geometric
# DGL is currently unsupported and uses libtorch, so we need to build it from source
WORKDIR /opt/matsciml
COPY . .
RUN pip install -e .
RUN git clone --recurse-submodules https://github.com/dmlc/dgl.git /opt/dgl
ENV DGL_HOME=/opt/dgl
WORKDIR /opt/dgl/build
RUN cmake -DUSE_CUDA=OFF -DPython3_EXECUTABLE=/opt/conda/bin/python .. && make
WORKDIR /opt/dgl/python
RUN pip install .
RUN micromamba clean --all --yes && rm -rf /opt/xpu-backend /var/lib/apt/lists/*
# make conda read-writable for user
RUN chown -R $MAMBA_USER:$MAMBA_USER /opt/matsciml && chown -R $MAMBA_USER:$MAMBA_USER /opt/conda
# change back to non-root user
USER $MAMBA_USER
LABEL org.opencontainers.image.authors="Kin Long Kelvin Lee"
LABEL org.opencontainers.image.vendor="Intel Labs"
LABEL org.opencontainers.image.base.name="amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly-xpu"
LABEL org.opencontainers.image.title="kinlongk-pytorch"
LABEL org.opencontainers.image.description="XPU-enabled PyTorch+Triton from GitHub artifact wheel builds."
HEALTHCHECK NONE
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
61 changes: 61 additions & 0 deletions docker/Dockerfile.xpu
@@ -0,0 +1,61 @@
FROM mambaorg/micromamba:noble AS oneapi_xpu
SHELL ["/bin/bash", "-c"]
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
# for setup run with root
USER 0
# needed to make sure mamba environment is activated
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# VGID is a build argument specifying the group ID of the render group on the host
# it must match the host's render GID, otherwise non-root users will not pick up the cards
ARG VGID=993
# install firmware, oneAPI, etc.
RUN apt-get update -y && apt-get install -y software-properties-common wget git make g++ gcc gpg-agent
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
RUN echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" > /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu noble unified" > /etc/apt/sources.list.d/intel-gpu-noble.list
RUN apt-get update -y && \
apt-get upgrade -y && \
apt-get install -y git make g++ gcc gpg-agent wget \
intel-for-pytorch-gpu-dev-0.5 \
intel-pti-dev \
cmake \
tzdata \
zlib1g zlib1g-dev \
xpu-smi \
intel-opencl-icd intel-level-zero-gpu libze1 intel-oneapi-mpi \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# make sure oneAPI components are in environment variables
RUN source /opt/intel/oneapi/setvars.sh
# make it so you don't have to source oneAPI every time
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
FROM oneapi_xpu
# symlink python and pip so the conda interpreters are found on PATH
RUN ln -s /opt/conda/bin/python /usr/local/bin/python && ln -s /opt/conda/bin/pip /usr/local/bin/pip
# clone matsciml into container and install
RUN git clone https://github.com/IntelLabs/matsciml /opt/matsciml
WORKDIR /opt/matsciml
# install packages, particularly xpu torch from nightly wheels
RUN micromamba install -y -n base -f conda.yml && \
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/xpu && \
pip install -e './[all]'
RUN micromamba clean --all --yes && rm -rf /opt/xpu-backend /var/lib/apt/lists/*
# let non-root mamba user have access to GPUs
RUN groupadd -g $VGID render && usermod -a -G video,render $MAMBA_USER
# make conda read-writable for user
RUN chown -R $MAMBA_USER:$MAMBA_USER /opt/matsciml && chown -R $MAMBA_USER:$MAMBA_USER /opt/conda
# change back to non-root user
USER $MAMBA_USER
LABEL org.opencontainers.image.authors="Kin Long Kelvin Lee"
LABEL org.opencontainers.image.vendor="Intel Labs"
LABEL org.opencontainers.image.base.name="mambaorg/micromamba:noble"
LABEL org.opencontainers.image.title="kinlongk-pytorch"
LABEL org.opencontainers.image.description="XPU-enabled PyTorch+Triton from GitHub artifact wheel builds."
HEALTHCHECK NONE
5 changes: 5 additions & 0 deletions docker/entrypoint.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# this sources oneAPI components and silences the output so we don't
# have to see the wall of text every time we enter the container
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1
exec "$@"
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -12,6 +12,7 @@ The Open MatSciML Toolkit

Getting started <self>
datasets
schema
transforms
models
training
181 changes: 181 additions & 0 deletions docs/source/schema.rst
@@ -0,0 +1,181 @@
Schema
==========

The Open MatSciML Toolkit places an emphasis on reproducibility and on the general
rule of "explicit is better than implicit" by defining schema for data and other development
concepts.

The intention is to move away from hardcoded ``Dataset`` classes, which are rigid in
that they require writing code, and which are not always reliably reproducible as the
underlying data and frameworks change and evolve over time. Instead, the schema
provided in ``matsciml`` shifts technical debt from maintaining code to
**documenting** data: given a thorough and complete description, the data should
in principle remain usable regardless of breaking API changes in the frameworks we rely
on, such as ``pymatgen``, ``torch_geometric``, and so on. As a dataset is defined and packaged
for distribution, the schema also makes the developer's intentions clear
to the end user, e.g. which target labels are available and how they were calculated,
which helps subsequent reproduction efforts. As a side effect, this also makes development of
``matsciml`` much more streamlined, since it homogenizes field names (i.e. we can
reliably expect ``cart_coords`` to be available and to contain Cartesian coordinates).

.. TIP::
    You do not have to construct objects contained in schema if they are ``pydantic``
    models themselves: for example, the ``PeriodicBoundarySchema`` is required in
    ``DataSampleSchema``, but you can alternatively just pass a dictionary with the
    expected key/value mappings (i.e. ``{'x': True, 'y': True, 'z': False}``) for
    the relevant schema.
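
For instance, both of the following forms yield equivalent validated objects (a minimal
sketch; the import path of ``PeriodicBoundarySchema`` is assumed here for illustration):

.. code-block:: python

    from matsciml.datasets.schema import PeriodicBoundarySchema  # import path assumed

    # explicit construction of the nested schema...
    pbc = PeriodicBoundarySchema(x=True, y=True, z=False)
    # ...or let pydantic validate a plain dictionary with the same keys
    same_pbc = PeriodicBoundarySchema.model_validate({"x": True, "y": True, "z": False})
    assert pbc == same_pbc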


Dataset schema reference
########################

This schema lays out what can be described as metadata for a dataset. We define all of
the expected fields in ``targets``, and record checksums for each dataset split, so that
it is always clear which model was trained on which specific split. Currently, it is the
responsibility of the dataset distributor to record this metadata for their dataset,
and package it as a ``metadata.json`` file in the same folder as the HDF5 files.

.. autoclass:: matsciml.datasets.schema.DatasetSchema
    :members:
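
Because ``DatasetSchema`` is a ``pydantic`` model, an existing ``metadata.json`` can be
validated back into a schema object through the standard ``pydantic`` v2 interface; the
snippet below is a minimal sketch (``matsciml`` may also provide its own loading helpers):

.. code-block:: python

    from pathlib import Path

    from matsciml.datasets.schema import DatasetSchema

    # read and validate the metadata file shipped alongside the HDF5 splits
    metadata = DatasetSchema.model_validate_json(
        Path("path/to/dataset/metadata.json").read_text()
    )
    # ``targets`` documents every property expected to be available in the dataset
    print(metadata.targets)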

Data sample schema reference
############################

This schema describes a **single** data sample, providing standardized field names for
a host of commonly used properties. Most properties are optional at construction,
but we highly recommend perusing the fields shown below to find the attribute closest to
the property being recorded: ``pydantic`` does not allow arbitrary attributes to be stored
in schema, but non-standard properties can be stashed away in ``extras`` as a dictionary of
property names and values.

.. autoclass:: matsciml.datasets.schema.DataSampleSchema
    :members:
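
As an illustration, a non-standard property such as a band gap can be carried in ``extras``.
The snippet below is a hedged sketch: apart from the fields used elsewhere on this page,
the field names (e.g. ``atomic_numbers``) and the minimal set of required fields are
assumptions, so consult the reference above for the authoritative definition.

.. code-block:: python

    from matsciml.datasets.schema import DataSampleSchema

    # "band_gap" has no standard field, so it is stashed in ``extras``
    # instead of being set as an arbitrary attribute on the schema
    sample = DataSampleSchema(
        index=0,
        num_atoms=2,
        cart_coords=[[0.0, 0.0, 0.0], [0.0, 0.0, 1.16]],
        atomic_numbers=[8, 6],  # field name assumed for illustration
        datatype="OptimizationCycle",
        extras={"band_gap": 1.23},
    )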


Creating datasets with schema
#############################

.. NOTE::
    This section is primarily for people interested in developing new datasets.

The premise behind defining these schema rigorously is to encourage reproducible workflows
with (hopefully) less technical debt: we can rely on the data to validate itself,
catch mistakes when serializing/dumping data for others to use, and set reasonable expectations
about what data will be available at which parts of training and evaluation. For those interested
in creating their own datasets, this section lays out some preliminary notes on how to wrangle
your data so that it adheres to the schema and is ready to be used by the pipeline.

Matching your data to the schema
================================

The first step in dataset creation is taking whatever primary data format you have and mapping
it to the ``DataSampleSchema`` laid out above. The required keys include ``index``, ``cart_coords``,
and so on, and by definition need to be provided. The code below shows an example loop
over a list of data samples, converting each one into a dictionary with the keys expected by ``DataSampleSchema``:

.. code-block:: python
    :caption: Example abstract code for mapping your data to the schema

    from matsciml.datasets.schema import DataSampleSchema

    all_data = ...  # should be a list of samples
    samples = []
    for index, data in enumerate(all_data):
        temp_dict = {}
        temp_dict["cart_coords"] = data.positions
        temp_dict["index"] = index
        temp_dict["datatype"] = "OptimizationCycle"  # must be one of the enums
        temp_dict["num_atoms"] = len(data.positions)
        schema = DataSampleSchema(**temp_dict)
        samples.append(schema)

You end up with a list of ``DataSampleSchema`` objects, each of which has undergone all of
the validation and consistency checks.

Data splits
======================

At this point you could call it a day, but if we want to create uniform random
training and validation splits, this is a good point to do so. The code below
shows one way of generating the splits; keep in mind that this mechanism for
splitting might not be appropriate for your data: to mitigate data leakage,
you may need to consider more sophisticated algorithms that account for chemical
elements, de-correlate dynamics, and so on. Treat the code below as boilerplate, and
modify it as needed.

.. code-block:: python
    :caption: Example code showing how to generate training and validation splits

    import numpy as np
    import h5py

    from matsciml.datasets.generic import write_data_to_hdf5_group

    SEED = 73926  # this will be reused when generating the metadata
    rng = np.random.default_rng(SEED)

    all_indices = np.arange(len(samples))
    val_split = int(len(samples) * 0.2)
    rng.shuffle(all_indices)
    train_indices = all_indices[val_split:]
    val_indices = all_indices[:val_split]

    # instantiate HDF5 files
    train_h5 = h5py.File("./train.h5", mode="w")

    for index in train_indices:
        sample = samples[index]
        # store each data sample as a group comprising array data
        group = train_h5.create_group(str(index))
        # takes advantage of pydantic serialization
        for key, value in sample.model_dump(round_trip=True).items():
            if value is not None:
                write_data_to_hdf5_group(key, value, group)

    # flush and close the file once all samples are written
    train_h5.close()

Repeat the loop above for your validation set, as sketched below.
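
The following is a minimal sketch of the same loop applied to ``val_indices``; the
output file name ``validation.h5`` is only an example.

.. code-block:: python

    val_h5 = h5py.File("./validation.h5", mode="w")

    for index in val_indices:
        sample = samples[index]
        group = val_h5.create_group(str(index))
        for key, value in sample.model_dump(round_trip=True).items():
            if value is not None:
                write_data_to_hdf5_group(key, value, group)

    val_h5.close()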

Dataset metadata
==================

Once we have created these splits, there is a set of metadata associated
with **how** we created them that we should record, so that at runtime
there is no ambiguity about which data and splits are being used and where they
came from.

.. code-block:: python

    from datetime import datetime

    from matsciml.datasets.generic import MatSciMLDataset
    from matsciml.datasets.schema import DatasetSchema

    # use the datasets we created above; `strict_checksum` needs to be
    # set to False here because we're going to be generating the checksum
    train_dset = MatSciMLDataset("./train.h5", strict_checksum=False)
    train_checksum = train_dset.blake2s_checksum

    # fill in the dataset metadata schema
    dset_schema = DatasetSchema(
        name="My new dataset",
        creation=datetime.now(),
        split_blake2s={
            "train": train_checksum,
            "validation": ...,
            "test": ...,  # these should be made the same way as the training set
        },
        targets=[...],  # see below
        dataset_type="OptimizationCycle",  # choose one of the `DataTypeEnum`
        seed=SEED,  # from the first code snippet
    )
    # writes the schema where it's expected
    dset_schema.to_json_file("metadata.json")


Hopefully you can appreciate that the metadata is meant to lessen the burden
on future users of the dataset (including yourself!). The last thing to cover
here is ``targets``, which was omitted in the snippet above: this field is meant
for you to record every property, whether or not it is part of the standard
``DataSampleSchema``, that is intended to be used throughout training. This is
the ``TargetSchema``: you must detail the name, expected shape, and a short
description of every property (including the standard ones). The main motivation
for this is that ``total_energy`` may mean something very different from one dataset
to the next (electronic energy? thermodynamic corrections?), and specifying this
for the end user removes any ambiguity.
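
As a rough illustration, a single entry might look like the following; this is a hedged
sketch, and the exact field names and types should be checked against the reference below.

.. code-block:: python

    from matsciml.datasets.schema import TargetSchema

    # hypothetical sketch of documenting one target; field names are assumed
    energy_target = TargetSchema(
        name="total_energy",
        shape="1",  # one scalar value per data sample
        description="Electronic total energy in eV from DFT, without thermodynamic corrections.",
    )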


.. autoclass:: matsciml.datasets.schema.TargetSchema
    :members:
5 changes: 5 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# this sources oneAPI components and silences the output so we don't
# have to see the wall of text every time we enter the container
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1
exec "$@"
6 changes: 6 additions & 0 deletions matsciml/datasets/__init__.py
Expand Up @@ -20,6 +20,8 @@
from matsciml.datasets.ocp_datasets import IS2REDataset, S2EFDataset
from matsciml.datasets.oqmd import OQMDDataset
from matsciml.datasets.symmetry import SyntheticPointGroupDataset
from matsciml.datasets.schema import DatasetSchema, DataSampleSchema
from matsciml.datasets.generic import MatSciMLDataset, MatSciMLDataModule

__all__ = [
"AlexandriaDataset",
@@ -34,4 +36,8 @@
"SyntheticPointGroupDataset",
"MultiDataset",
"ColabFitDataset",
"DatasetSchema",
"DataSampleSchema",
"MatSciMLDataModule",
"MatSciMLDataset",
]
