Skip to content

Commit

Permalink
Merge pull request #162 from ipcamit/kliff-master-v1
Browse files Browse the repository at this point in the history
Colabfit enhancements by Eric
  • Loading branch information
mjwen authored Feb 10, 2024
2 parents 6f9388b + 4ae331d commit 6741f52
Showing 1 changed file with 84 additions and 60 deletions.
144 changes: 84 additions & 60 deletions kliff/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from kliff.dataset.extxyz import read_extxyz, write_extxyz
from kliff.dataset.weight import Weight
from kliff.utils import stress_to_voigt, to_path
from kliff.utils import stress_to_tensor, stress_to_voigt, to_path

# For type checking
if TYPE_CHECKING:
Expand All @@ -24,6 +24,7 @@
MongoDatabase = None

import ase.io
from ase.data import chemical_symbols

# map from file_format to file extension
SUPPORTED_FORMAT = {"xyz": ".xyz"}
Expand Down Expand Up @@ -162,8 +163,7 @@ def to_file(self, filename: Path, file_format: str = "xyz"):
def from_colabfit(
cls,
database_client: "MongoDatabase",
configuration_id: str,
property_ids: Optional[Union[List[str], str]] = None,
data_object: dict,
weight: Optional[Weight] = None,
):
"""
Expand All @@ -172,46 +172,50 @@ def from_colabfit(
Args:
database_client: Instance of connected MongoDatabase client, which can be used to
fetch database from colabfit-tools dataset.
configuration_id: ID of the configuration instance to be collected from the collection
"configuration" in colabfit-tools.
property_ids: ID of the property instance to be associated with current configuration.
Usually properties would be trained against. Each associated property "field" will be
matched against provided list of aux_property_fields.
data_object: colabfit data object dictionary to be associated with current
configuration and property.
weight: an instance that computes the weight of the configuration in the loss
function.
"""
try:
fetched_configuration: "ColabfitConfiguration" = (
database_client.get_configuration(configuration_id)
configuration_id = data_object["relationships"][0]["configuration"]
fetched_configuration = database_client.configurations.find_one(
{"colabfit-id": data_object["relationships"][0]["configuration"]}
)
fetched_properties = list(
database_client.property_instances.find(
{
"colabfit-id": {
"$in": data_object["relationships"][0]["property_instance"]
}
}
)
)
except:
raise ConfigurationError(
"Looks like Mongo database did not return appropriate response. "
f"Please run db.configurations.find('_id':{configuration_id}) to verify response. "
f"Or try running the following in separate Python terminal:\n"
"from colabfit.tools.database import MongoDatabase\n"
f"client = MongoDatabase({database_client.database_name})\n"
f"client.get_configuration({configuration_id})\n"
" \n"
"Above shall return a Configuration object with ASE Atoms format.",
f"Please run db.configurations.find('_id':{data_object}) to verify response. "
)
coords = fetched_configuration.arrays["positions"]
species = fetched_configuration.get_chemical_symbols()
cell = np.array(fetched_configuration.cell.todict()["array"])
PBC = fetched_configuration.pbc

# get energy, forces, stresses from the property ids
energy = cls._get_colabfit_property(
database_client, property_ids, "energy", "potential-energy"
)
forces = cls._get_colabfit_property(
database_client, property_ids, "forces", "atomic-forces"
)
stress = cls._get_colabfit_property(
database_client, property_ids, "stress", "cauchy-stress"
)
stress = stress_to_voigt(stress)
cell = np.asarray(fetched_configuration["cell"])
# TODO: consistent Z -> symbol mapping -> Z mapping across all kliff
species = [
chemical_symbols[int(i)] for i in fetched_configuration["atomic_numbers"]
]
coords = np.asarray(fetched_configuration["positions"])
PBC = [bool(i) for i in fetched_configuration["pbc"]]

energy = None
forces = None
stress = None
for property in fetched_properties:
if property["type"] == "potential-energy":
energy = float(property["potential-energy"]["energy"]["source-value"])
elif property["type"] == "atomic-forces":
forces = np.asarray(property["atomic-forces"]["forces"]["source-value"])
elif property["type"] == "cauchy-stress":
stress = np.asarray(property["cauchy-stress"]["stress"]["source-value"])

stress = stress_to_voigt(stress)
self = cls(
cell,
species,
Expand All @@ -224,9 +228,7 @@ def from_colabfit(
weight=weight,
)
self.metadata = {
"database_client": database_client,
"property_id": property_ids,
"configuration_id": configuration_id,
"data_object": data_object,
}

return self
Expand Down Expand Up @@ -278,6 +280,27 @@ def from_ase_atoms(
)
return self

def to_ase_atoms(self):
    """
    Build an ``ase.Atoms`` object from this configuration.

    The species, coordinates, cell and periodic boundary conditions are
    always passed to the ``ase.Atoms`` constructor. Energy, forces and
    stress are attached only when they are not ``None``: energy and stress
    are stored in ``atoms.info`` (the stress after conversion with
    ``stress_to_tensor``), and forces are stored as the "forces" array.

    Returns:
        ase.Atoms representation of the Configuration
    """
    ase_atoms = ase.Atoms(
        symbols=self.species,
        positions=self.coords,
        cell=self.cell,
        pbc=self.PBC,
    )

    # Optional properties are copied over one at a time, in the order
    # energy -> forces -> stress, and skipped when absent.
    energy = self.energy
    if energy is not None:
        ase_atoms.info["energy"] = energy

    forces = self.forces
    if forces is not None:
        ase_atoms.set_array("forces", forces)

    stress = self.stress
    if stress is not None:
        ase_atoms.info["stress"] = stress_to_tensor(stress)

    return ase_atoms

@property
def cell(self) -> np.ndarray:
"""
Expand Down Expand Up @@ -541,18 +564,22 @@ class Dataset:
def __init__(self, configurations: Iterable = None):
if configurations is None:
self.configs = []
elif isinstance(configurations, Iterable):
elif isinstance(configurations, Iterable) and not isinstance(
configurations, str
):
self.configs = list(configurations)
else:
raise DatasetError(
"configurations must be a iterable of Configuration objects."
)

@classmethod
@requires(MongoDatabase is not None, "colabfit-tools is not installed")
def from_colabfit(
cls,
colabfit_database: str,
colabfit_dataset: str,
colabfit_uri: str = "mongodb://localhost:27017",
weight: Optional[Weight] = None,
) -> "Dataset":
"""
Expand All @@ -562,18 +589,25 @@ def from_colabfit(
weight: an instance that computes the weight of the configuration in the loss
function.
colabfit_database: Name of the colabfit Mongo database to read from.
colabfit_dataset: Name of the colabfit dataset instance to read from.
colabfit_dataset: Name of the colabfit dataset instance to read from, usually
it is of form, e.g., "DS_xxxxxxxxxxxx_0"
colabfit_uri: connection URI of the colabfit Mongo database to read from.
Returns:
A dataset of configurations.
"""
instance = cls()
instance.add_from_colabfit(colabfit_database, colabfit_dataset, weight)
instance.add_from_colabfit(
colabfit_database, colabfit_dataset, colabfit_uri, weight
)
return instance

@staticmethod
@requires(MongoDatabase is not None, "colabfit-tools is not installed")
def _read_from_colabfit(
database_client, colabfit_dataset, weight
database_client: MongoDatabase,
colabfit_dataset: str,
weight: Optional[Weight] = None,
) -> List[Configuration]:
"""
Read configurations from colabfit database.
Expand All @@ -589,30 +623,18 @@ def _read_from_colabfit(
A list of configurations.
"""
# get configuration and property ID and send it to load configuration-first get Data Objects
dataset_dos = database_client.get_data(
"data_objects",
fields=["colabfit-id"],
query={"relationships.datasets": {"$in": [colabfit_dataset]}},
data_objects = database_client.data_objects.find(
{"relationships.dataset": colabfit_dataset}
)
if not dataset_dos:
if not data_objects:
logger.error(f"{colabfit_dataset} is either empty or does not exist")
raise DatasetError(f"{colabfit_dataset} is either empty or does not exist")

configs = []
for do in dataset_dos:
co_doc = database_client.configurations.find_one(
{"relationships.data_objects": {"$in": [do]}}
)
pi_doc = database_client.property_instances.find(
{"relationships.data_objects": {"$in": [do]}}
)
co_id = co_doc["colabfit-id"]
pi_ids = [i["colabfit-id"] for i in pi_doc]

for data_object in data_objects:
configs.append(
Configuration.from_colabfit(database_client, co_id, pi_ids, weight)
Configuration.from_colabfit(database_client, data_object, weight)
)
# TODO: reduce number of queries to database. Current: 4 per configuration

if len(configs) <= 0:
raise DatasetError(f"No dataset file with in {colabfit_dataset} dataset.")
Expand All @@ -626,21 +648,23 @@ def add_from_colabfit(
self,
colabfit_database: str,
colabfit_dataset: str,
colabfit_uri: str = "mongodb://localhost:27017",
weight: Optional[Weight] = None,
):
"""
Read configurations from colabfit database and add them to the dataset.
Args:
colabfit_database: Name of the colabfit Mongo database to read from.
colabfit_dataset: Name of the colabfit dataset instance to read from.
colabfit_dataset: Name of the colabfit dataset instance to read from (usually
it is of form, e.g., "DS_xxxxxxxxxxxx_0")
colabfit_uri: connection URI of the colabfit Mongo database to read from.
weight: an instance that computes the weight of the configuration in the loss
function.
"""
# open link to the mongo
mongo_client = MongoDatabase(colabfit_database)
colabfit_dataset = colabfit_dataset
mongo_client = MongoDatabase(colabfit_database, uri=colabfit_uri)
configs = Dataset._read_from_colabfit(mongo_client, colabfit_dataset, weight)
self.configs.extend(configs)

Expand Down

0 comments on commit 6741f52

Please sign in to comment.