Merge pull request #322 from laserkelvin/hdf5-datasets
Data schema and HDF5 data support
Showing 12 changed files with 2,137 additions and 46 deletions.
@@ -0,0 +1,29 @@
FROM amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly-xpu
# for setup run with root
USER 0
# needed to make sure mamba environment is activated
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# install packages, particularly xpu torch from nightly wheels
RUN pip install torch-geometric
# DGL is currently unsupported and uses libtorch, so we need to build it from source
WORKDIR /opt/matsciml
COPY . .
RUN pip install -e .
RUN git clone --recurse-submodules https://github.com/dmlc/dgl.git /opt/dgl
ENV DGL_HOME=/opt/dgl
WORKDIR /opt/dgl/build
RUN cmake -DUSE_CUDA=OFF -DPython3_EXECUTABLE=/opt/conda/bin/python .. && make
WORKDIR /opt/dgl/python
RUN pip install .
RUN micromamba clean --all --yes && rm -rf /opt/xpu-backend /var/lib/apt/lists/*
# make conda read-writable for user
RUN chown -R $MAMBA_USER:$MAMBA_USER /opt/matsciml && chown -R $MAMBA_USER:$MAMBA_USER /opt/conda
# change back to non-root user
USER $MAMBA_USER
LABEL org.opencontainers.image.authors="Kin Long Kelvin Lee"
LABEL org.opencontainers.image.vendor="Intel Labs"
LABEL org.opencontainers.image.base.name="amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly"
LABEL org.opencontainers.image.title="kinlongk-pytorch"
LABEL org.opencontainers.image.description="XPU enabled PyTorch+Triton from Github artifact wheel builds."
HEALTHCHECK NONE
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
@@ -0,0 +1,61 @@
FROM mambaorg/micromamba:noble AS oneapi_xpu
SHELL ["/bin/bash", "-c"]
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
# for setup run with root
USER 0
# needed to make sure mamba environment is activated
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# VGID is a build argument specifying the group ID of the render group on the host;
# this needs to match, otherwise non-root users will not pick up cards
ARG VGID=993
# install firmware, oneAPI, etc.
RUN apt-get update -y && apt-get install -y software-properties-common wget git make g++ gcc gpg-agent
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
RUN echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" > /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
    | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu noble unified" > /etc/apt/sources.list.d/intel-gpu-noble.list
RUN apt-get update -y && \
    apt-get upgrade -y && \
    apt-get install -y git make g++ gcc gpg-agent wget \
    intel-for-pytorch-gpu-dev-0.5 \
    intel-pti-dev \
    cmake \
    tzdata \
    zlib1g zlib1g-dev \
    xpu-smi \
    intel-opencl-icd intel-level-zero-gpu libze1 intel-oneapi-mpi \
    intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
    libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
    libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
    mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# make sure oneAPI components are in environment variables
RUN source /opt/intel/oneapi/setvars.sh
# make it so you don't have to source oneAPI every time
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
FROM oneapi_xpu
# symlink python and pip so the conda versions are found on PATH
RUN ln -s /opt/conda/bin/python /usr/local/bin/python && ln -s /opt/conda/bin/pip /usr/local/bin/pip
# clone matsciml into container and install
RUN git clone https://github.com/IntelLabs/matsciml /opt/matsciml
WORKDIR /opt/matsciml
# install packages, particularly xpu torch from nightly wheels
RUN micromamba install -y -n base -f conda.yml && \
    pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/xpu && \
    pip install -e './[all]'
RUN micromamba clean --all --yes && rm -rf /opt/xpu-backend /var/lib/apt/lists/*
# let the non-root mamba user have access to GPUs
RUN groupadd -g $VGID render && usermod -a -G video,render $MAMBA_USER
# make conda read-writable for user
RUN chown -R $MAMBA_USER:$MAMBA_USER /opt/matsciml && chown -R $MAMBA_USER:$MAMBA_USER /opt/conda
# change back to non-root user
USER $MAMBA_USER
LABEL org.opencontainers.image.authors="Kin Long Kelvin Lee"
LABEL org.opencontainers.image.vendor="Intel Labs"
LABEL org.opencontainers.image.base.name="amr-registry.caas.intel.com/aipg/kinlongk-pytorch:nightly"
LABEL org.opencontainers.image.title="kinlongk-pytorch"
LABEL org.opencontainers.image.description="XPU enabled PyTorch+Triton from Github artifact wheel builds."
HEALTHCHECK NONE
@@ -0,0 +1,5 @@
#!/bin/bash
# this sources oneAPI components and silences the output so we don't
# have to see the wall of text every time we enter the container
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1
exec "$@"
@@ -12,6 +12,7 @@ The Open MatSciML Toolkit
Getting started <self>
datasets
schema
transforms
models
training
@@ -0,0 +1,181 @@
Schema
==========

The Open MatSciML Toolkit places an emphasis on reproducibility and on the general
rule of "explicit is better than implicit" by defining schema for data and other development
concepts.

The intention is to move away from hardcoded ``Dataset`` classes, which are rigid in
that they require writing code, and are not always reliably reproducible as the
underlying data and frameworks change and evolve over time. Instead, the schema
provided in ``matsciml`` tries to shift technical debt from maintaining code to
**documenting** data, which, assuming a thorough and complete description, should
in principle be usable regardless of breaking API changes in frameworks that we rely
on like ``pymatgen``, ``torch_geometric``, and so on. As a dataset is being defined and
packaged for distribution, the schema should also make the intentions of the developer clear
to the end-user, e.g. what target label is available, how it was calculated, and so on,
to help subsequent reproduction efforts. As a side effect, this also makes development of
``matsciml`` a lot more streamlined, as it homogenizes field names (i.e. we can
reliably expect ``cart_coords`` to be available and to contain cartesian coordinates).

.. TIP::
   You do not have to construct objects contained in schema if they are ``pydantic``
   models themselves: for example, the ``PeriodicBoundarySchema`` is required in
   ``DataSampleSchema``, but you can alternatively just pass a dictionary with the
   expected key/value mappings (i.e. ``{'x': True, 'y': True, 'z': False}``) for
   the relevant schema.

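As a minimal sketch of the tip above (assuming ``PeriodicBoundarySchema`` lives in
``matsciml.datasets.schema`` alongside the other schema documented on this page), the
two forms below are interchangeable wherever the schema is expected:

.. code-block:: python
   :caption: Passing a schema object versus an equivalent dictionary

   from matsciml.datasets.schema import PeriodicBoundarySchema

   # explicit construction of the schema object
   pbc = PeriodicBoundarySchema(x=True, y=True, z=False)
   # alternatively, a plain dictionary with the expected key/value mappings
   pbc_dict = {"x": True, "y": True, "z": False}
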
Dataset schema reference
########################

This schema lays out what can be described as metadata for a dataset. We define all of
the expected fields in ``targets``, and record checksums for each dataset split so that
we can keep track of which model was trained on which specific split. Currently, it is the
responsibility of the dataset distributor to record this metadata for their dataset
and package it as a ``metadata.json`` file in the same folder as the HDF5 files.

.. autoclass:: matsciml.datasets.schema.DatasetSchema
   :members:

Data sample schema reference
############################

This schema describes a **single** data sample, providing standardized field names for
a host of commonly used properties. Most properties are optional for class construction,
but we highly recommend perusing the fields shown below to find the attribute closest to
the property being recorded: ``pydantic`` does not allow arbitrary attributes to be stored
in schema, but non-standard properties can be stashed away in ``extras`` as a dictionary of
property names and values.

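As a short, hypothetical sketch (the values are made up, and only the handful of fields
used in the abstract example later on this page are shown), a non-standard property can
be carried along in ``extras``:

.. code-block:: python
   :caption: Stashing a non-standard property in ``extras``

   import numpy as np

   from matsciml.datasets.schema import DataSampleSchema

   coords = np.random.rand(4, 3)  # dummy cartesian coordinates for four atoms
   sample = DataSampleSchema(
       index=0,
       num_atoms=4,
       cart_coords=coords,
       datatype="OptimizationCycle",
       # anything without a standard field name goes into ``extras``
       extras={"synthesis_temperature": 450.0},
   )
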
.. autoclass:: matsciml.datasets.schema.DataSampleSchema
   :members:

Creating datasets with schema
#############################

.. NOTE::
   This section is primarily for people interested in developing new datasets.

The premise behind defining these schema rigorously is to encourage reproducible workflows
with (hopefully) less technical debt: we can safely rely on data to validate itself,
catch mistakes when serializing/dumping data for others to use, and set reasonable expectations
on what data will be available at what parts of training and evaluation. For those interested
in creating their own datasets, this section lays out some preliminary notes on how to wrangle
your data to adhere to the schema, and make the data ready to be used by the pipeline.

Matching your data to the schema
================================

The first step in dataset creation is taking whatever primary data format you have and mapping
it to the ``DataSampleSchema`` laid out above. The required keys include ``index``, ``cart_coords``,
and so on, and by definition need to be provided. The code below shows an example loop
over a list of data, which we convert into dictionaries with the same keys as expected by ``DataSampleSchema``:

.. code-block:: python
   :caption: Example abstract code for mapping your data to the schema

   from matsciml.datasets.schema import DataSampleSchema

   all_data = ...  # should be a list of samples
   samples = []
   for index, data in enumerate(all_data):
       temp_dict = {}
       temp_dict["cart_coords"] = data.positions
       temp_dict["index"] = index
       temp_dict["datatype"] = "OptimizationCycle"  # must be one of the enums
       temp_dict["num_atoms"] = len(data.positions)
       schema = DataSampleSchema(**temp_dict)
       samples.append(schema)

You end up with a list of ``DataSampleSchema`` objects, each of which has undergone all of
the validation and consistency checks.

Data splits
===========

At this point you could call it a day, but if we want to create uniform random
training and validation splits, this is a good point to do so. The code below
shows one way of generating the splits: keep in mind that this mechanism for
splitting might not be appropriate for your data. To mitigate data leakage,
you may need to consider more sophisticated algorithms that account for chemical
elements, de-correlate dynamics, and so on. Treat the code below as boilerplate
and modify it as needed.

.. code-block:: python
   :caption: Example code showing how to generate training and validation splits

   import numpy as np
   import h5py

   from matsciml.datasets.generic import write_data_to_hdf5_group

   SEED = 73926  # this will be reused when generating the metadata
   rng = np.random.default_rng(SEED)

   all_indices = np.arange(len(samples))
   val_split = int(len(samples) * 0.2)
   rng.shuffle(all_indices)
   train_indices = all_indices[val_split:]
   val_indices = all_indices[:val_split]

   # instantiate HDF5 files
   train_h5 = h5py.File("./train.h5", mode="w")

   for index in train_indices:
       sample = samples[index]
       # store each data sample as a group comprising array data
       group = train_h5.create_group(str(index))
       # takes advantage of pydantic serialization
       for key, value in sample.model_dump(round_trip=True).items():
           if value is not None:
               write_data_to_hdf5_group(key, value, group)

Repeat the loop above for your validation set.

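A minimal sketch of that repetition, continuing from the snippet above and assuming the
validation split is written to a separate ``val.h5`` file (the filename is just a convention
here):

.. code-block:: python
   :caption: Writing the validation split

   val_h5 = h5py.File("./val.h5", mode="w")

   for index in val_indices:
       sample = samples[index]
       group = val_h5.create_group(str(index))
       for key, value in sample.model_dump(round_trip=True).items():
           if value is not None:
               write_data_to_hdf5_group(key, value, group)
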
Dataset metadata
================

Once we have created these splits, there is a bunch of metadata associated
with **how** we created them that we should record, so that at runtime
there is no ambiguity about which data and splits are being used and where they
came from.

.. code-block:: python

   from datetime import datetime

   from matsciml.datasets.generic import MatSciMLDataset
   from matsciml.datasets.schema import DatasetSchema

   # use the datasets we created above; `strict_checksum` needs to be
   # set to False here because we're going to be generating the checksum
   train_dset = MatSciMLDataset("./train.h5", strict_checksum=False)
   train_checksum = train_dset.blake2s_checksum

   # fill in the dataset metadata schema
   dset_schema = DatasetSchema(
       name="My new dataset",
       creation=datetime.now(),
       split_blake2s={
           "train": train_checksum,
           "validation": ...,
           "test": ...,  # these should be made the same way as the training set
       },
       targets=[...],  # see below
       dataset_type="OptimizationCycle",  # choose one of the `DataTypeEnum` values
       seed=SEED,  # from the first code snippet
   )
   # writes the schema where it's expected
   dset_schema.to_json_file("metadata.json")

Hopefully you can appreciate that the metadata is meant to lessen the burden
on future users of the dataset (including yourself!). The last thing to cover
here is the ``targets`` field that was omitted in the snippet above: it is meant
for you to record every property that may or may not be part of the standard
``DataSampleSchema`` and that is intended to be used throughout training. This is
the ``TargetSchema``: you must detail the name, expected shape, and a short
description of every property (including the standard ones). The main motivation
for this is that ``total_energy`` may mean something very different from one dataset
to the next (electronic energy? thermodynamic corrections?), and specifying this
for the end user removes any ambiguity.

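As a purely illustrative sketch (the field names below are assumptions; the authoritative
signature is in the reference that follows), an entry describing a total energy target
might look something like:

.. code-block:: python
   :caption: Hypothetical ``TargetSchema`` entry for a total energy label

   from matsciml.datasets.schema import TargetSchema

   energy_target = TargetSchema(
       name="total_energy",
       shape="(1,)",  # expected shape; the exact type used for shapes is an assumption
       description="Electronic total energy in eV, without thermodynamic corrections.",
   )
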
.. autoclass:: matsciml.datasets.schema.TargetSchema
   :members:
@@ -0,0 +1,5 @@
#!/bin/bash
# this sources oneAPI components and silences the output so we don't
# have to see the wall of text every time we enter the container
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1
exec "$@"