Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add OPTIMADE adapter #3876

Merged
merged 4 commits into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 194 additions & 0 deletions src/pymatgen/io/optimade.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
"""
This module provides conversion between structure entries following the
OPTIMADE (https://optimade.org) standard and pymatgen Structure objects.

The code is adapted from the `optimade.adapters.structures.pymatgen` module in
optimade-python-tools (https://github.com/Materials-Consortia/optimade-python-tools),
and aims to work without requiring the explicit installation of the `optimade-python-tools`.

"""

from __future__ import annotations

import itertools
import json
import math
import re
from functools import reduce
from typing import TYPE_CHECKING

from pymatgen.core.structure import Lattice, Structure

if TYPE_CHECKING:
from collections.abc import Generator
from typing import Any


__author__ = "Matthew Evans"


def _pymatgen_species(
nsites: int,
species_at_sites: list[str],
) -> list[dict[str, float]]:
"""Create list of {"symbol": "concentration"} per site for constructing pymatgen Species objects.
Removes vacancies, if they are present.

This function is adapted from the `optimade.adapters.structures.pymatgen` module in `optimade-python-tools`,
with some of the generality removed (in terms of partial occupancy).

"""
species = [{"name": _, "concentration": [1.0], "chemical_symbols": [_]} for _ in set(species_at_sites)]
species_dict = {_["name"]: _ for _ in species}

pymatgen_species = []
for site_number in range(nsites):
species_name = species_at_sites[site_number]
current_species = species_dict[species_name]

chemical_symbols = []
concentration = []
for index, symbol in enumerate(current_species["chemical_symbols"]):
if symbol == "vacancy":
# Skip. This is how pymatgen handles vacancies;
# to not include them, while keeping the concentration in a site less than 1.
continue
chemical_symbols.append(symbol)
concentration.append(current_species["concentration"][index])

pymatgen_species.append(dict(zip(chemical_symbols, concentration)))

return pymatgen_species


def _optimade_anonymous_element_generator() -> Generator[str, None, None]:
"""Generator that yields the next symbol in the A, B, Aa, ... Az OPTIMADE anonymous
element naming scheme.

"""
from string import ascii_lowercase

for size in itertools.count(1):
for tuple_strings in itertools.product(ascii_lowercase, repeat=size):
list_strings = list(tuple_strings)
list_strings[0] = list_strings[0].upper()
yield "".join(list_strings)


def _optimade_reduce_or_anonymize_formula(formula: str, alphabetize: bool = True, anonymize: bool = False) -> str:
"""Takes an input formula, reduces it and either alphabetizes or anonymizes it
following the OPTIMADE standard.

"""

numbers: list[int] = [int(n.strip() or 1) for n in re.split(r"[A-Z][a-z]*", formula)[1:]]
# Need to remove leading 1 from split and convert to ints

species: list[str] = re.findall("[A-Z][a-z]*", formula)

gcd = reduce(math.gcd, numbers)

if not len(species) == len(numbers):
raise ValueError(f"Something is wrong with the input formula: {formula}")

numbers = [n // gcd for n in numbers]

if anonymize:
numbers = sorted(numbers, reverse=True)
species = [s for _, s in zip(numbers, _optimade_anonymous_element_generator())]

elif alphabetize:
species, numbers = zip(*sorted(zip(species, numbers))) # type: ignore[assignment]

return "".join(f"{s}{n if n != 1 else ''}" for n, s in zip(numbers, species))


class OptimadeStructureAdapter:
"""Adapter serves as a bridge between OPTIMADE structures and pymatgen objects."""

@staticmethod
def get_optimade_structure(structure: Structure, **kwargs) -> dict[str, str | dict[str, Any]]:
"""Get a dictionary in the OPTIMADE Structure format from a pymatgen structure or molecule.

Args:
structure (Structure): pymatgen Structure
**kwargs: passed to the ASE Atoms constructor

Returns:
A dictionary serialization of the structure in the OPTIMADE format.

"""
if not structure.is_ordered:
raise ValueError("OPTIMADE Adapter currently only supports ordered structures")

attributes: dict[str, Any] = {}
attributes["cartesian_site_positions"] = structure.lattice.get_cartesian_coords(structure.frac_coords).tolist()
attributes["lattice_vectors"] = structure.lattice.matrix.tolist()
attributes["species_at_sites"] = [_.symbol for _ in structure.species]
attributes["species"] = [
{"name": _.symbol, "chemical_symbols": [_.symbol], "concentration": [1]}
for _ in set(structure.composition.elements)
]
attributes["dimension_types"] = [int(_) for _ in structure.lattice.pbc]
attributes["nperiodic_dimensions"] = sum(attributes["dimension_types"])
attributes["nelements"] = len(structure.composition.elements)
attributes["chemical_formula_anonymous"] = _optimade_reduce_or_anonymize_formula(
structure.composition.formula, anonymize=True
)
attributes["elements"] = sorted([_.symbol for _ in structure.composition.elements])
attributes["chemical_formula_reduced"] = _optimade_reduce_or_anonymize_formula(
structure.composition.formula, anonymize=False
)
attributes["chemical_formula_descriptive"] = structure.composition.formula
attributes["elements_ratios"] = [structure.composition.get_atomic_fraction(e) for e in attributes["elements"]]
attributes["nsites"] = len(attributes["species_at_sites"])

attributes["last_modified"] = None
attributes["immutable_id"] = None
attributes["structure_features"] = []

return {"attributes": attributes}

@staticmethod
def get_structure(resource: dict) -> Structure:
"""Get pymatgen structure from an OPTIMADE structure resource.

Args:
resource: OPTIMADE structure resource as a dictionary, JSON string, or the
corresponding attributes dictionary (i.e., `resource["attributes"]`).

Returns:
Structure: Equivalent pymatgen Structure

"""
if isinstance(resource, str):
try:
resource = json.loads(resource)
except json.JSONDecodeError as exc:
raise ValueError(f"Could not decode the input OPTIMADE resource as JSON: {exc}")

if "attributes" not in resource:
resource = {"attributes": resource}

_id = resource.get("id", None)
attributes = resource["attributes"]
properties: dict[str, Any] = {"optimade_id": _id}

# Take any prefixed attributes and save them as properties
custom_properties = {k: v for k, v in attributes.items() if k.startswith("_")}
if custom_properties:
properties["optimade_attributes"] = custom_properties

return Structure(
lattice=Lattice(
attributes["lattice_vectors"],
[bool(d) for d in attributes["dimension_types"]], # type: ignore[arg-type]
),
species=_pymatgen_species(
nsites=attributes["nsites"],
species_at_sites=attributes["species_at_sites"],
),
coords=attributes["cartesian_site_positions"],
coords_are_cartesian=True,
properties=properties,
)
37 changes: 37 additions & 0 deletions tests/io/test_optimade.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations

import numpy as np

from pymatgen.core import Structure
from pymatgen.io.optimade import OptimadeStructureAdapter
from pymatgen.util.testing import TEST_FILES_DIR, VASP_IN_DIR

STRUCTURE = Structure.from_file(f"{VASP_IN_DIR}/POSCAR")
XYZ_STRUCTURE = f"{TEST_FILES_DIR}/io/xyz/acetylene.xyz"


def test_get_optimade_structure_roundtrip():
optimade_structure = OptimadeStructureAdapter.get_optimade_structure(STRUCTURE)

assert optimade_structure["attributes"]["nsites"] == len(STRUCTURE)
assert optimade_structure["attributes"]["elements"] == ["Fe", "O", "P"]
assert optimade_structure["attributes"]["nelements"] == 3
assert optimade_structure["attributes"]["chemical_formula_reduced"] == "FeO4P"
assert optimade_structure["attributes"]["species_at_sites"] == 4 * ["Fe"] + 4 * ["P"] + 16 * ["O"]
np.testing.assert_array_almost_equal(
np.abs(optimade_structure["attributes"]["lattice_vectors"]), np.abs(STRUCTURE.lattice.matrix)
)

# Set an OPTIMADE ID and some custom properties and ensure they are preserved in the properties
test_id = "test_id"
optimade_structure["id"] = test_id
custom_properties = {"_custom_field": "test_custom_field", "_custom_band_gap": 2.2}
optimade_structure["attributes"].update(custom_properties)

roundtrip_structure = OptimadeStructureAdapter.get_structure(optimade_structure)
assert roundtrip_structure.properties["optimade_id"] == test_id
assert roundtrip_structure.properties["optimade_attributes"] == custom_properties

# Delete the properties before the check for equality
roundtrip_structure.properties = {}
assert roundtrip_structure == STRUCTURE