Merge pull request #322 from laserkelvin/hdf5-datasets
Data schema and HDF5 data support
laserkelvin authored Dec 13, 2024
2 parents 4c69638 + 00d2f63 commit dc6a125
Showing 12 changed files with 2,137 additions and 46 deletions.
29 changes: 29 additions & 0 deletions Dockerfile.xpu
@@ -0,0 +1,29 @@
FROM amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly-xpu
# for setup run with root
USER 0
# needed to make sure mamba environment is activated
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# install additional packages on top of the XPU torch provided by the base image
RUN pip install torch-geometric
# DGL is currently unsupported and uses libtorch, so we need to build it from source
WORKDIR /opt/matsciml
COPY . .
RUN pip install -e .
RUN git clone --recurse-submodules https://github.com/dmlc/dgl.git /opt/dgl
ENV DGL_HOME=/opt/dgl
WORKDIR /opt/dgl/build
RUN cmake -DUSE_CUDA=OFF -DPython3_EXECUTABLE=/opt/conda/bin/python .. && make
WORKDIR /opt/dgl/python
RUN pip install .
RUN micromamba clean --all --yes && rm -rf /opt/xpu-backend /var/lib/apt/lists/*
# make conda read-writable for user
RUN chown -R $MAMBA_USER:$MAMBA_USER /opt/matsciml && chown -R $MAMBA_USER:$MAMBA_USER /opt/conda
# change back to non-root user
USER $MAMBA_USER
LABEL org.opencontainers.image.authors="Kin Long Kelvin Lee"
LABEL org.opencontainers.image.vendor="Intel Labs"
LABEL org.opencontainers.image.base.name="amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly-xpu"
LABEL org.opencontainers.image.title="kinlongk-pytorch"
LABEL org.opencontainers.image.description="XPU-enabled PyTorch+Triton from GitHub artifact wheel builds."
HEALTHCHECK NONE
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
61 changes: 61 additions & 0 deletions docker/Dockerfile.xpu
@@ -0,0 +1,61 @@
FROM mambaorg/micromamba:noble AS oneapi_xpu
SHELL ["/bin/bash", "-c"]
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
# for setup run with root
USER 0
# needed to make sure mamba environment is activated
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# VGID is a build argument specifying the group ID of the render group on the host
# it must match the host's render GID, otherwise non-root users will not pick up the cards
ARG VGID=993
# install firmware, oneAPI, etc.
RUN apt-get update -y && apt-get install -y software-properties-common wget git make g++ gcc gpg-agent
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
RUN echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" > /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu noble unified" > /etc/apt/sources.list.d/intel-gpu-noble.list
RUN apt-get update -y && \
apt-get upgrade -y && \
apt-get install -y git make g++ gcc gpg-agent wget \
intel-for-pytorch-gpu-dev-0.5 \
intel-pti-dev \
cmake \
tzdata \
zlib1g zlib1g-dev \
xpu-smi \
intel-opencl-icd intel-level-zero-gpu libze1 intel-oneapi-mpi \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# make sure oneAPI components are in environment variables
RUN source /opt/intel/oneapi/setvars.sh
# make it so you don't have to source oneAPI every time
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
FROM oneapi_xpu
# symlink python and pip so the conda interpreters are found on PATH
RUN ln -s /opt/conda/bin/python /usr/local/bin/python && ln -s /opt/conda/bin/pip /usr/local/bin/pip
# clone matsciml into container and install
RUN git clone https://github.com/IntelLabs/matsciml /opt/matsciml
WORKDIR /opt/matsciml
# install packages, particularly xpu torch from nightly wheels
RUN micromamba install -y -n base -f conda.yml && \
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/xpu && \
pip install -e './[all]'
RUN micromamba clean --all --yes && rm -rf /opt/xpu-backend /var/lib/apt/lists/*
# let non-root mamba user have access to GPUs
RUN groupadd -g $VGID render && usermod -a -G video,render $MAMBA_USER
# make conda read-writable for user
RUN chown -R $MAMBA_USER:$MAMBA_USER /opt/matsciml && chown -R $MAMBA_USER:$MAMBA_USER /opt/conda
# change back to non-root user
USER $MAMBA_USER
LABEL org.opencontainers.image.authors="Kin Long Kelvin Lee"
LABEL org.opencontainers.image.vendor="Intel Labs"
LABEL org.opencontainers.image.base.name="mambaorg/micromamba:noble"
LABEL org.opencontainers.image.title="kinlongk-pytorch"
LABEL org.opencontainers.image.description="XPU-enabled PyTorch+Triton from GitHub artifact wheel builds."
HEALTHCHECK NONE
5 changes: 5 additions & 0 deletions docker/entrypoint.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# this sources oneAPI components and silences the output so we don't
# have to see the wall of text every time we enter the container
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1
exec "$@"
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -12,6 +12,7 @@ The Open MatSciML Toolkit

Getting started <self>
datasets
schema
transforms
models
training
181 changes: 181 additions & 0 deletions docs/source/schema.rst
@@ -0,0 +1,181 @@
Schema
==========

The Open MatSciML Toolkit places an emphasis on reproducibility and on the general
rule of "explicit is better than implicit" by defining schema for data and other development
concepts.

The intention is to move away from hardcoded ``Dataset`` classes, which are rigid in
that they require writing code, and which are not always reliably reproducible as the
underlying data and frameworks change and evolve over time. Instead, the schema
provided in ``matsciml`` shifts technical debt from maintaining code to
**documenting** data: given a thorough and complete description, the data should
in principle remain usable regardless of breaking API changes in the frameworks we rely
on, such as ``pymatgen``, ``torch_geometric``, and so on. As a dataset is defined and packaged
for distribution, the schema also makes the developer's intentions clear
to the end user, e.g. which target labels are available and how they were calculated,
which helps subsequent reproduction efforts. As a side effect, this also makes development of
``matsciml`` much more streamlined, since it homogenizes field names (i.e. we can
reliably expect ``cart_coords`` to be available and to contain Cartesian coordinates).

.. TIP::
    You do not have to construct objects contained in schema if they are ``pydantic``
    models themselves: for example, the ``PeriodicBoundarySchema`` is required in
    ``DataSampleSchema``, but you can alternatively just pass a dictionary with the
    expected key/value mappings (i.e. ``{'x': True, 'y': True, 'z': False}``) for
    the relevant schema.
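
For instance, both of the following forms yield equivalent validated objects (a minimal
sketch; the import path of ``PeriodicBoundarySchema`` is assumed here for illustration):

.. code-block:: python

    from matsciml.datasets.schema import PeriodicBoundarySchema  # import path assumed

    # explicit construction of the nested schema...
    pbc = PeriodicBoundarySchema(x=True, y=True, z=False)
    # ...or let pydantic validate a plain dictionary with the same keys
    same_pbc = PeriodicBoundarySchema.model_validate({"x": True, "y": True, "z": False})
    assert pbc == same_pbc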


Dataset schema reference
########################

This schema lays out what can be described as metadata for a dataset. We define all of
the expected fields in ``targets``, and record checksums for each dataset split, so that
it is always clear which model was trained on which specific split. Currently, it is the
responsibility of the dataset distributor to record this metadata for their dataset,
and package it as a ``metadata.json`` file in the same folder as the HDF5 files.

.. autoclass:: matsciml.datasets.schema.DatasetSchema
    :members:
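
Because ``DatasetSchema`` is a ``pydantic`` model, an existing ``metadata.json`` can be
validated back into a schema object through the standard ``pydantic`` v2 interface; the
snippet below is a minimal sketch (``matsciml`` may also provide its own loading helpers):

.. code-block:: python

    from pathlib import Path

    from matsciml.datasets.schema import DatasetSchema

    # read and validate the metadata file shipped alongside the HDF5 splits
    metadata = DatasetSchema.model_validate_json(
        Path("path/to/dataset/metadata.json").read_text()
    )
    # ``targets`` documents every property expected to be available in the dataset
    print(metadata.targets)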

Data sample schema reference
############################

This schema describes a **single** data sample, providing standardized field names for
a host of commonly used properties. Most properties are optional at construction,
but we highly recommend perusing the fields shown below to find the attribute closest to
the property being recorded: ``pydantic`` does not allow arbitrary attributes to be stored
in schema, but non-standard properties can be stashed away in ``extras`` as a dictionary of
property names and values.

.. autoclass:: matsciml.datasets.schema.DataSampleSchema
    :members:
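
As an illustration, a non-standard property such as a band gap can be carried in ``extras``.
The snippet below is a hedged sketch: apart from the fields used elsewhere on this page,
the field names (e.g. ``atomic_numbers``) and the minimal set of required fields are
assumptions, so consult the reference above for the authoritative definition.

.. code-block:: python

    from matsciml.datasets.schema import DataSampleSchema

    # "band_gap" has no standard field, so it is stashed in ``extras``
    # instead of being set as an arbitrary attribute on the schema
    sample = DataSampleSchema(
        index=0,
        num_atoms=2,
        cart_coords=[[0.0, 0.0, 0.0], [0.0, 0.0, 1.16]],
        atomic_numbers=[8, 6],  # field name assumed for illustration
        datatype="OptimizationCycle",
        extras={"band_gap": 1.23},
    )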


Creating datasets with schema
#############################

.. NOTE::
    This section is primarily for people interested in developing new datasets.

The premise behind defining these schema rigorously is to encourage reproducible workflows
with (hopefully) less technical debt: we can rely on the data to validate itself,
catch mistakes when serializing/dumping data for others to use, and set reasonable expectations
about what data will be available at which parts of training and evaluation. For those interested
in creating their own datasets, this section lays out some preliminary notes on how to wrangle
your data so that it adheres to the schema and is ready to be used by the pipeline.

Matching your data to the schema
================================

The first step in dataset creation is taking whatever primary data format you have and mapping
it to the ``DataSampleSchema`` laid out above. The required keys include ``index``, ``cart_coords``,
and so on, and by definition need to be provided. The code below shows an example loop
over a list of data samples, converting each one into a dictionary with the keys expected by ``DataSampleSchema``:

.. code-block:: python
    :caption: Example abstract code for mapping your data to the schema

    from matsciml.datasets.schema import DataSampleSchema

    all_data = ...  # should be a list of samples
    samples = []
    for index, data in enumerate(all_data):
        temp_dict = {}
        temp_dict["cart_coords"] = data.positions
        temp_dict["index"] = index
        temp_dict["datatype"] = "OptimizationCycle"  # must be one of the enums
        temp_dict["num_atoms"] = len(data.positions)
        schema = DataSampleSchema(**temp_dict)
        samples.append(schema)

You end up with a list of ``DataSampleSchema`` objects, each of which has undergone all of
the validation and consistency checks.

Data splits
======================

At this point you could call it a day, but if we want to create uniform random
training and validation splits, this is a good point to do so. The code below
shows one way of generating the splits; keep in mind that this mechanism for
splitting might not be appropriate for your data: to mitigate data leakage,
you may need to consider more sophisticated algorithms that account for chemical
elements, de-correlate dynamics, and so on. Treat the code below as boilerplate, and
modify it as needed.

.. code-block:: python
    :caption: Example code showing how to generate training and validation splits

    import numpy as np
    import h5py

    from matsciml.datasets.generic import write_data_to_hdf5_group

    SEED = 73926  # this will be reused when generating the metadata
    rng = np.random.default_rng(SEED)

    all_indices = np.arange(len(samples))
    val_split = int(len(samples) * 0.2)
    rng.shuffle(all_indices)
    train_indices = all_indices[val_split:]
    val_indices = all_indices[:val_split]

    # instantiate HDF5 files
    train_h5 = h5py.File("./train.h5", mode="w")

    for index in train_indices:
        sample = samples[index]
        # store each data sample as a group comprising array data
        group = train_h5.create_group(str(index))
        # takes advantage of pydantic serialization
        for key, value in sample.model_dump(round_trip=True).items():
            if value is not None:
                write_data_to_hdf5_group(key, value, group)

    # flush and close the file once all samples are written
    train_h5.close()

Repeat the loop above for your validation set, as sketched below.
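
The following is a minimal sketch of the same loop applied to ``val_indices``; the
output file name ``validation.h5`` is only an example.

.. code-block:: python

    val_h5 = h5py.File("./validation.h5", mode="w")

    for index in val_indices:
        sample = samples[index]
        group = val_h5.create_group(str(index))
        for key, value in sample.model_dump(round_trip=True).items():
            if value is not None:
                write_data_to_hdf5_group(key, value, group)

    val_h5.close()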

Dataset metadata
==================

Once we have created these splits, there is a set of metadata associated
with **how** we created them that we should record, so that at runtime
there is no ambiguity about which data and splits are being used and where they
came from.

.. code-block:: python

    from datetime import datetime

    from matsciml.datasets.generic import MatSciMLDataset
    from matsciml.datasets.schema import DatasetSchema

    # use the datasets we created above; `strict_checksum` needs to be
    # set to False here because we're going to be generating the checksum
    train_dset = MatSciMLDataset("./train.h5", strict_checksum=False)
    train_checksum = train_dset.blake2s_checksum

    # fill in the dataset metadata schema
    dset_schema = DatasetSchema(
        name="My new dataset",
        creation=datetime.now(),
        split_blake2s={
            "train": train_checksum,
            "validation": ...,
            "test": ...,  # these should be made the same way as the training set
        },
        targets=[...],  # see below
        dataset_type="OptimizationCycle",  # choose one of the `DataTypeEnum`
        seed=SEED,  # from the first code snippet
    )
    # writes the schema where it's expected
    dset_schema.to_json_file("metadata.json")


Hopefully you can appreciate that the metadata is meant to lessen the burden
on future users of the dataset (including yourself!). The last thing to cover
here is ``targets``, which was omitted in the snippet above: this field is meant
for you to record every property, whether or not it is part of the standard
``DataSampleSchema``, that is intended to be used throughout training. This is
the ``TargetSchema``: you must detail the name, expected shape, and a short
description of every property (including the standard ones). The main motivation
for this is that ``total_energy`` may mean something very different from one dataset
to the next (electronic energy? thermodynamic corrections?), and specifying this
for the end user removes any ambiguity.
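
As a rough illustration, a single entry might look like the following; this is a hedged
sketch, and the exact field names and types should be checked against the reference below.

.. code-block:: python

    from matsciml.datasets.schema import TargetSchema

    # hypothetical sketch of documenting one target; field names are assumed
    energy_target = TargetSchema(
        name="total_energy",
        shape="1",  # one scalar value per data sample
        description="Electronic total energy in eV from DFT, without thermodynamic corrections.",
    )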


.. autoclass:: matsciml.datasets.schema.TargetSchema
    :members:
5 changes: 5 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# this sources oneAPI components and silences the output so we don't
# have to see the wall of text every time we enter the container
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1
exec "$@"
6 changes: 6 additions & 0 deletions matsciml/datasets/__init__.py
Expand Up @@ -20,6 +20,8 @@
from matsciml.datasets.ocp_datasets import IS2REDataset, S2EFDataset
from matsciml.datasets.oqmd import OQMDDataset
from matsciml.datasets.symmetry import SyntheticPointGroupDataset
from matsciml.datasets.schema import DatasetSchema, DataSampleSchema
from matsciml.datasets.generic import MatSciMLDataset, MatSciMLDataModule

__all__ = [
"AlexandriaDataset",
@@ -34,4 +36,8 @@
"SyntheticPointGroupDataset",
"MultiDataset",
"ColabFitDataset",
"DatasetSchema",
"DataSampleSchema",
"MatSciMLDataModule",
"MatSciMLDataset",
]
