diff --git a/_config.yml b/_config.yml deleted file mode 100644 index c419263..0000000 --- a/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-cayman \ No newline at end of file diff --git a/ecnet/__init__.py b/ecnet/__init__.py index ce53add..017f38b 100644 --- a/ecnet/__init__.py +++ b/ecnet/__init__.py @@ -1 +1,2 @@ from .model import ECNet +__version__ = '4.1.0' diff --git a/ecnet/model.py b/ecnet/model.py index dd1d244..a766419 100644 --- a/ecnet/model.py +++ b/ecnet/model.py @@ -61,7 +61,8 @@ def fit(self, smiles: List[str] = None, target_vals: List[List[float]] = None, dataset: QSPRDataset = None, backend: str = 'padel', batch_size: int = 32, epochs: int = 100, lr_decay: float = 0.0, valid_size: float = 0.0, valid_eval_iter: int = 1, patience: int = 16, verbose: int = 0, - random_state: int = None, **kwargs) -> Tuple[List[float], List[float]]: + random_state: int = None, shuffle: bool = False, + **kwargs) -> Tuple[List[float], List[float]]: """ fit: fits ECNet to either (1) SMILES and target values, or (2) a pre-loaded QSPRDataset; the training process utilizes the Adam optimization algorithm, MSE loss, ReLU activation @@ -90,6 +91,8 @@ def fit(self, smiles: List[str] = None, target_vals: List[List[float]] = None, verbose (int, optional): if > 0, will print every `this` epochs; default = 0 random_state (int, optional): random_state used by sklearn.model_selection. train_test_split; default = None + shuffle (bool, optional): if True, shuffles training/validation data between epochs; + default = False; random_state should be None **kwargs: arguments accepted by torch.optim.Adam (i.e. 
learning rate, beta values) Returns: @@ -136,6 +139,18 @@ def fit(self, smiles: List[str] = None, target_vals: List[List[float]] = None, if not CBO.on_epoch_begin(epoch): break + if shuffle: + index_train, index_valid = train_test_split( + [i for i in range(len(dataset))], test_size=valid_size, + random_state=random_state + ) + dataloader_train = DataLoader( + Subset(dataset, index_train), batch_size=batch_size, shuffle=True + ) + dataloader_valid = DataLoader( + Subset(dataset, index_valid), batch_size=len(index_valid), shuffle=True + ) + train_loss = 0.0 self.train() diff --git a/ecnet/tasks/__init__.py b/ecnet/tasks/__init__.py index 2a1233d..de6541f 100644 --- a/ecnet/tasks/__init__.py +++ b/ecnet/tasks/__init__.py @@ -1,2 +1,3 @@ from .feature_selection import select_rfr -from .parameter_tuning import tune_batch_size, tune_model_architecture, tune_training_parameters +from .parameter_tuning import N_TESTS, CONFIG, tune_batch_size, tune_model_architecture,\ + tune_training_parameters diff --git a/ecnet/tasks/parameter_tuning.py b/ecnet/tasks/parameter_tuning.py index e8365cc..266c981 100644 --- a/ecnet/tasks/parameter_tuning.py +++ b/ecnet/tasks/parameter_tuning.py @@ -1,17 +1,19 @@ from ecabc import ABC from sklearn.metrics import median_absolute_error from copy import deepcopy +import numpy as np from ..model import ECNet from ..datasets.structs import QSPRDataset from typing import Iterable +N_TESTS = 10 CONFIG = { 'training_params_range': { - 'lr': (0.0, 0.05), - 'lr_decay': (0.0, 0.0001) + 'lr': (1e-16, 0.05), + 'lr_decay': (1e-16, 0.0001) }, 'architecture_params_range': { 'hidden_dim': (1, 1024), @@ -38,6 +40,8 @@ def _get_kwargs(**kwargs): 'eval_ds': kwargs.get('eval_ds'), 'epochs': kwargs.get('epochs', 100), 'batch_size': kwargs.get('batch_size', 32), + 'valid_size': kwargs.get('valid_size', 0.2), + 'patience': kwargs.get('patience', 32), 'lr_decay': kwargs.get('lr_decay', 0.0), 'lr': kwargs.get('lr', 0.001), 'beta_1': kwargs.get('beta_1', 0.9), @@ 
-54,6 +58,8 @@ def _get_kwargs(**kwargs): def _evaluate_model(trial_spec: dict) -> float: """ Training sub-function for cost functions _cost_batch_size, _cost_arch, _cost_train_hp; + Each model configuration is tested ecnet.tasks.parameter_tuning.N_TESTS times, average + median absolute error across all tests returned; default 10 tests per configuration Args: trial_spec (dict): all relevant parameters for this training trial @@ -62,25 +68,32 @@ def _evaluate_model(trial_spec: dict) -> float: float: median absolute error for dataset being evaluated (trial_spec['eval_ds']) """ - model = deepcopy(trial_spec['model']) - model._hidden_dim = trial_spec['hidden_dim'] - model._n_hidden = trial_spec['n_hidden'] - model._dropout = trial_spec['dropout'] - model._construct() - model.fit( - dataset=trial_spec['train_ds'], - epochs=trial_spec['epochs'], - batch_size=trial_spec['batch_size'], - lr_decay=trial_spec['lr_decay'], - lr=trial_spec['lr_decay'], - betas=(trial_spec['beta_1'], trial_spec['beta_2']), - eps=trial_spec['eps'], - weight_decay=trial_spec['weight_decay'], - amsgrad=trial_spec['amsgrad'] + model = ECNet( + trial_spec['train_ds'].desc_vals.shape[1], + trial_spec['train_ds'].target_vals.shape[1], + trial_spec['hidden_dim'], + trial_spec['n_hidden'], + trial_spec['dropout'] ) - yhat_eval = model(trial_spec['eval_ds'].desc_vals).detach().numpy() - y_eval = trial_spec['eval_ds'].target_vals - return median_absolute_error(y_eval, yhat_eval) + maes = [] + for _ in range(N_TESTS): + model._construct() + model.fit( + dataset=trial_spec['train_ds'], + epochs=trial_spec['epochs'], + batch_size=trial_spec['batch_size'], + patience=trial_spec['patience'], + lr_decay=trial_spec['lr_decay'], + lr=trial_spec['lr_decay'], + betas=(trial_spec['beta_1'], trial_spec['beta_2']), + eps=trial_spec['eps'], + weight_decay=trial_spec['weight_decay'], + amsgrad=trial_spec['amsgrad'] + ) + yhat_eval = model(trial_spec['eval_ds'].desc_vals).detach().numpy() + y_eval = 
trial_spec['eval_ds'].target_vals + maes.append(median_absolute_error(y_eval, yhat_eval)) + return np.mean(maes) def _cost_batch_size(vals: Iterable[float], **kwargs) -> float: @@ -100,20 +113,43 @@ def _cost_batch_size(vals: Iterable[float], **kwargs) -> float: return _evaluate_model(trial_spec) -def tune_batch_size(n_bees: int, n_iter: int, n_processes: int = 1, **kwargs) -> dict: +def tune_batch_size(n_bees: int, n_iter: int, dataset_train: QSPRDataset, + dataset_eval: QSPRDataset, n_processes: int = 1, + **kwargs) -> dict: """ - Tunes the batch size during training + Tunes the batch size during training; additional **kwargs can include any in: + [ + # ECNet parameters + 'epochs' (default 100), + 'valid_size' (default 0.2), + 'patience' (default 32), + 'lr_decay' (default 0.0), + 'hidden_dim' (default 128), + 'n_hidden' (default 2), + 'dropout': (default 0.0), + # Adam optim. alg. arguments + 'lr' (default 0.001), + 'beta_1' (default 0.9), + 'beta_2' (default 0.999), + 'eps' (default 1e-8), + 'weight_decay' (default 0.0), + 'amsgrad' (default False) + ] Args: n_bees (int): number of employer bees to use in ABC algorithm n_iter (int): number of iterations, or "search cycles", for ABC algorithm - n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration - **kwargs: arguments passed to _cost_batch_size + dataset_train (QSPRDataset): dataset used to train evaluation models + dataset_eval (QSPRDataset): dataset used for evaluation + n_processes (int, optional): if > 1, uses multiprocessing when evaluating at an iteration + **kwargs: additional arguments Returns: - dict: {'batch_size': tuned batch size} + dict: {'batch_size': int} """ + kwargs['train_ds'] = dataset_train + kwargs['eval_ds'] = dataset_eval abc = ABC(n_bees, _cost_batch_size, num_processes=n_processes, obj_fn_args=kwargs) abc.add_param(1, len(kwargs.get('train_ds').desc_vals), name='batch_size') abc.initialize() @@ -144,20 +180,42 @@ def _cost_arch(vals, **kwargs): return 
_evaluate_model(trial_spec) -def tune_model_architecture(n_bees: int, n_iter: int, n_processes: int = 1, **kwargs) -> dict: +def tune_model_architecture(n_bees: int, n_iter: int, dataset_train: QSPRDataset, + dataset_eval: QSPRDataset, n_processes: int = 1, + **kwargs) -> dict: """ - Tunes the NN's architecture + Tunes model architecture parameters (number of hidden layers, neurons per hidden layer, neuron + dropout); additional **kwargs can include any in: + [ + # ECNet parameters + 'epochs' (default 100), + 'batch_size' (default 32), + 'valid_size' (default 0.2), + 'patience' (default 32), + 'lr_decay' (default 0.0), + # Adam optim. alg. arguments + 'lr' (default 0.001), + 'beta_1' (default 0.9), + 'beta_2' (default 0.999), + 'eps' (default 1e-8), + 'weight_decay' (default 0.0), + 'amsgrad' (default False) + ] Args: n_bees (int): number of employer bees to use in ABC algorithm n_iter (int): number of iterations, or "search cycles", for ABC algorithm - n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration - **kwargs: arguments passed to _cost_batch_size + dataset_train (QSPRDataset): dataset used to train evaluation models + dataset_eval (QSPRDataset): dataset used for evaluation + n_processes (int, optional): if > 1, uses multiprocessing when evaluating at an iteration + **kwargs: additional arguments Returns: - dict: {'batch_size': opt_val, 'n_hidden': opt_val, 'dropout': opt_val} + dict: {'hidden_dim': int, 'n_hidden': int, 'dropout': float} """ + kwargs['train_ds'] = dataset_train + kwargs['eval_ds'] = dataset_eval abc = ABC(n_bees, _cost_arch, num_processes=n_processes, obj_fn_args=kwargs) abc.add_param(CONFIG['architecture_params_range']['hidden_dim'][0], CONFIG['architecture_params_range']['hidden_dim'][1], name='hidden_dim') @@ -195,20 +253,42 @@ def _cost_train_hp(vals, **kwargs): return _evaluate_model(trial_spec) -def tune_training_parameters(n_bees: int, n_iter: int, n_processes: int = 1, **kwargs) -> dict: +def 
tune_training_parameters(n_bees: int, n_iter: int, dataset_train: QSPRDataset, + dataset_eval: QSPRDataset, n_processes: int = 1, + **kwargs) -> dict: """ - Tunes the NN's training parameters (Adam optim. fn.) + Tunes learning rate, learning rate decay; additional **kwargs can include any in: + [ + # ECNet parameters + 'epochs' (default 100), + 'batch_size' (default 32), + 'valid_size' (default 0.2), + 'patience' (default 32), + 'hidden_dim' (default 128), + 'n_hidden' (default 2), + 'dropout': (default 0.0), + # Adam optim. alg. arguments + 'beta_1' (default 0.9), + 'beta_2' (default 0.999), + 'eps' (default 1e-8), + 'weight_decay' (default 0.0), + 'amsgrad' (default False) + ] Args: n_bees (int): number of employer bees to use in ABC algorithm n_iter (int): number of iterations, or "search cycles", for ABC algorithm - n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration - **kwargs: arguments passed to _cost_batch_size + dataset_train (QSPRDataset): dataset used to train evaluation models + dataset_eval (QSPRDataset): dataset used for evaluation + n_processes (int, optional): if > 1, uses multiprocessing when evaluating at an iteration + **kwargs: additional arguments Returns: - dict: {'lr': opt_val, 'lr_decay': opt_val} + dict: {'lr': float, 'lr_decay': float} """ + kwargs['train_ds'] = dataset_train + kwargs['eval_ds'] = dataset_eval abc = ABC(n_bees, _cost_train_hp, num_processes=n_processes, obj_fn_args=kwargs) abc.add_param(CONFIG['training_params_range']['lr'][0], CONFIG['training_params_range']['lr'][1], name='lr') diff --git a/examples/getting_started.ipynb b/examples/getting_started.ipynb new file mode 100644 index 0000000..866fb07 --- /dev/null +++ b/examples/getting_started.ipynb @@ -0,0 +1,372 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.10 64-bit ('ecnet': conda)" + }, + "interpreter": { + "hash": "b8ddbdeb4e8d258564393fa38a886a93a7bbb414136361bc7c3a5a1c29ceff9e" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of samples: 43\nNumber of QSPR descriptors per sample: 1875\n" + ] + } + ], + "source": [ + "# First, let's load our experimental cloud point data; we're using PaDEL-Descriptor to generate QSPR descriptors\n", + "\n", + "from ecnet.datasets import load_cp\n", + "\n", + "dataset = load_cp(as_dataset=True, backend='padel')\n", + "\n", + "print(f'Number of samples: {dataset.desc_vals.shape[0]}')\n", + "print(f'Number of QSPR descriptors per sample: {dataset.desc_vals.shape[1]}')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of samples in the training set: 34\nNumber of samples in the testing set: 9\n" + ] + } + ], + "source": [ + "# Now we create training and testing data subsets; our ANNs regress directly on the training data, and the test set is used to measure blind prediction accuracy\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from copy import deepcopy\n", + "\n", + "index_train, index_test = train_test_split([i for i in range(len(dataset))], test_size=0.2, random_state=24)\n", + "\n", + "dataset_train = deepcopy(dataset)\n", + "dataset_train.set_index(index_train)\n", + "\n", + "dataset_test = deepcopy(dataset)\n", + "dataset_test.set_index(index_test)\n", + "\n", + "print(f'Number of samples in the training set: {len(dataset_train)}')\n", + "print(f'Number of samples in the testing set: {len(dataset_test)}')" + ] + }, + { + 
"cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n \n \n \n \n 2021-06-30T12:35:48.301674\n image/svg+xml\n \n \n Matplotlib v3.4.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "# Many QSPR descriptors are not important when predicting cloud point for a 
database of hydrocarbons and oxygenated compounds; for example, the descriptor counting the number of nitrogen atoms will be zero for all compounds. We will select the descriptors with the highest correlation to cloud point for use as ANN inputs, such that 95% of total correlation (derived from random forest regression) is retained:\n", + "\n", + "from ecnet.tasks.feature_selection import select_rfr\n", + "from matplotlib import pyplot as plt\n", + "\n", + "# Note: we select based on the training set, we want the test set to be 100% blind\n", + "desc_idx, desc_imp = select_rfr(dataset_train, total_importance=0.95, n_estimators=50)\n", + "\n", + "dataset_train.set_desc_index(desc_idx)\n", + "dataset_test.set_desc_index(desc_idx)\n", + "\n", + "# Let's graph importance (individual and cumulative sum) for the selected descriptors:\n", + "rank = [i for i in range(len(desc_imp))]\n", + "tot_imp = [0.0]\n", + "for imp in desc_imp:\n", + " tot_imp.append(tot_imp[-1] + imp)\n", + "tot_imp = tot_imp[1:]\n", + "\n", + "plt.clf()\n", + "fig, ax = plt.subplots(constrained_layout=True)\n", + "ax.set_xlabel('Selected descriptor (most-to-least important)')\n", + "ax.set_ylabel('Descriptor importance')\n", + "ax.plot(rank, desc_imp, color='red')\n", + "ax2 = ax.twinx()\n", + "ax2.set_ylabel('Cumulative importance')\n", + "ax2.plot(rank, tot_imp, color='blue')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n \n \n \n \n 2021-06-30T12:35:48.486805\n image/svg+xml\n \n \n Matplotlib v3.4.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "# We observe that there are only a handful of QSPR descriptors with significant correlation to cloud point; let's visualize the relationship between kinematic viscosity and the descriptor with the highest importance:\n", + "\n", + "ysi = [dataset_train.target_vals[i][0] for i in range(len(dataset_train))]\n", + "top_desc = [dataset_train.desc_vals[i][0] for i in range(len(dataset_train))]\n", + "\n", + "plt.clf()\n", + "plt.xlabel(f'{dataset_train.desc_names[0]} value')\n", + "plt.ylabel('Experimental 
cloud point value (deg. C)')\n", + "plt.scatter(top_desc, ysi, color='blue')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch: 0 | Train loss: 3019.848388671875 | Valid loss: 9223372036854775807\n", + "Epoch: 10 | Train loss: 301.50048828125 | Valid loss: 571.7460327148438\n", + "Epoch: 20 | Train loss: 351.8019104003906 | Valid loss: 800.0086669921875\n", + "Epoch: 30 | Train loss: 330.5296325683594 | Valid loss: 539.5159301757812\n", + "Epoch: 40 | Train loss: 264.02142333984375 | Valid loss: 315.75274658203125\n", + "Epoch: 50 | Train loss: 198.6739501953125 | Valid loss: 249.75222778320312\n", + "Epoch: 60 | Train loss: 87.38817596435547 | Valid loss: 125.52989196777344\n", + "Epoch: 70 | Train loss: 59.61891555786133 | Valid loss: 100.54280090332031\n", + "Epoch: 80 | Train loss: 43.34950256347656 | Valid loss: 78.16502380371094\n", + "Epoch: 90 | Train loss: 60.24040985107422 | Valid loss: 91.14484405517578\n" + ] + } + ], + "source": [ + "# Enough exploration, let's train an ANN to predict cloud point:\n", + "\n", + "from ecnet import ECNet\n", + "\n", + "# Create an ANN with `n` input neurons (where `n` == number of selected QSPR descriptors), 2 hidden layers with 256 neurons each, and one output neuron (corresponding to cloud point)\n", + "model = ECNet(dataset_train.desc_vals.shape[1], 1, 256, 2)\n", + "# arguments follow [input dim, output dim, hidden dim, n hidden]\n", + "\n", + "# Train the ANN using training dataset, with a random 20% of the dataset used for validation every epoch:\n", + "train_loss, valid_loss = model.fit(\n", + "    dataset=dataset_train, valid_size=0.2, verbose=10,\n", + "    patience=32, epochs=300, random_state=None, shuffle=True,\n", + "    lr=0.005\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + 
"data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n \n \n \n \n 2021-06-30T12:35:48.955909\n image/svg+xml\n \n \n Matplotlib v3.4.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "# We can visualize the training set's loss over training, as well as the validation subset's loss:\n", + "\n", + "from math import sqrt\n", + "\n", + "train_loss = [sqrt(l) for l in train_loss][5:]\n", + "valid_loss = [sqrt(l) for l in valid_loss][5:]\n", + "epoch = [i for i in range(len(train_loss))]\n", + "plt.clf()\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Sqrt(Loss)')\n", + "plt.plot(epoch, train_loss, color='blue', label='Training Loss')\n", + "plt.plot(epoch, valid_loss, color='red', label='Validation Loss')\n", + "plt.legend(loc='upper right')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Training median absolute error: 4.776594161987305\nTraining r-squared coefficient: 0.8675852185693186\nTesting median absolute error: 9.086380004882812\nTesting r-squared coefficient: 0.8877325994065388\n" + ] + } + ], + "source": [ + "# Let's calculate median absolute error and r-squared coefficient for each dataset:\n", + "\n", + "from sklearn.metrics import median_absolute_error, r2_score\n", + "\n", + "y_hat_train = model(dataset_train.desc_vals).detach().numpy()\n", + "y_train = dataset_train.target_vals\n", + "train_mae = median_absolute_error(y_hat_train, y_train)\n", + "train_r2 = r2_score(y_hat_train, y_train)\n", + "y_hat_test = model(dataset_test.desc_vals).detach().numpy()\n", + "y_test = dataset_test.target_vals\n", + "test_mae = median_absolute_error(y_hat_test, y_test)\n", + "test_r2 = r2_score(y_hat_test, y_test)\n", + "print(f'Training median absolute error: {train_mae}')\n", + "print(f'Training r-squared coefficient: {train_r2}')\n", + "print(f'Testing median absolute error: {test_mae}')\n", + "print(f'Testing r-squared coefficient: {test_r2}')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n \n \n \n \n 2021-06-30T12:35:49.142649\n image/svg+xml\n \n \n Matplotlib v3.4.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "# Now we can visually compare predicted values to experimental values:\n", + "\n", + "plt.clf()\n", + "plt.xlabel('Experimental CP Value (deg. C)')\n", + "plt.ylabel('Predicted CP Value (deg. 
C)')\n", + "plt.scatter(y_train, y_hat_train, color='blue', label='Training Set')\n", + "plt.scatter(y_test, y_hat_test, color='red', label='Testing Set')\n", + "plt.legend(loc='upper left')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's save our model for later use:\n", + "\n", + "model.save('cp_model.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Training median absolute error: 4.776594161987305\nTraining r-squared coefficient: 0.8675852185693186\nTesting median absolute error: 9.086380004882812\nTesting r-squared coefficient: 0.8877325994065388\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n \n \n \n \n 2021-06-30T12:35:49.338191\n image/svg+xml\n \n \n Matplotlib v3.4.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "# And test to make sure we can recall it:\n", + "\n", + "from ecnet.model import load_model\n", + "\n", + "model_2 = load_model('cp_model.pt')\n", + "\n", + "y_hat_train = model_2(dataset_train.desc_vals).detach().numpy()\n", + "y_train = dataset_train.target_vals\n", + "train_mae = median_absolute_error(y_hat_train, 
y_train)\n", + "train_r2 = r2_score(y_hat_train, y_train)\n", + "y_hat_test = model_2(dataset_test.desc_vals).detach().numpy()\n", + "y_test = dataset_test.target_vals\n", + "test_mae = median_absolute_error(y_hat_test, y_test)\n", + "test_r2 = r2_score(y_hat_test, y_test)\n", + "print(f'Training median absolute error: {train_mae}')\n", + "print(f'Training r-squared coefficient: {train_r2}')\n", + "print(f'Testing median absolute error: {test_mae}')\n", + "print(f'Testing r-squared coefficient: {test_r2}')\n", + "\n", + "plt.clf()\n", + "plt.xlabel('Experimental CP Value (deg. C)')\n", + "plt.ylabel('Predicted CP Value (deg. C)')\n", + "plt.scatter(y_train, y_hat_train, color='blue', label='Training Set')\n", + "plt.scatter(y_test, y_hat_test, color='red', label='Testing Set')\n", + "plt.legend(loc='upper left')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/setup.py b/setup.py index df8a239..f91321c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='ecnet', - version='4.0.0', + version='4.1.0', description='Fuel property prediction using QSPR descriptors', url='https://github.com/ecrl/ecnet', author='Travis Kessler', diff --git a/tests/test_all.py b/tests/test_all.py index d138d0a..99d18cb 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -257,7 +257,7 @@ def test_tune_batch_size(self): targets = [[5.0]] ds_eval = QSPRDataset(smiles, targets, backend=_BACKEND) model = ECNet(_N_DESC, 1, 5, 1) - res = tune_batch_size(1, 1, _N_PROCESSES, model=model, train_ds=ds_train, eval_ds=ds_eval) + res = tune_batch_size(1, 1, ds_train, ds_eval, _N_PROCESSES) self.assertTrue(1 <= res['batch_size'] <= len(ds_train.target_vals)) def test_tune_model_architecture(self): @@ -270,8 +270,7 @@ def test_tune_model_architecture(self): targets = [[5.0]] ds_eval = QSPRDataset(smiles, targets, backend=_BACKEND) model = ECNet(_N_DESC, 
1, 5, 1) - res = tune_model_architecture(1, 1, _N_PROCESSES, model=model, train_ds=ds_train, - eval_ds=ds_eval) + res = tune_model_architecture(1, 1, ds_train, ds_eval, _N_PROCESSES) for k in list(res.keys()): self.assertTrue(res[k] >= CONFIG['architecture_params_range'][k][0]) self.assertTrue(res[k] <= CONFIG['architecture_params_range'][k][1]) @@ -285,12 +284,8 @@ def test_tune_training_hps(self): smiles = ['CCCCC'] targets = [[5.0]] ds_eval = QSPRDataset(smiles, targets, backend=_BACKEND) - model = ECNet(_N_DESC, 1, 5, 1) - res = tune_training_parameters(1, 1, _N_PROCESSES, model=model, train_ds=ds_train, - eval_ds=ds_eval) + res = tune_training_parameters(1, 1, ds_train, ds_eval, _N_PROCESSES) for k in list(res.keys()): - if k == 'betas': - continue self.assertTrue(res[k] >= CONFIG['training_params_range'][k][0]) self.assertTrue(res[k] <= CONFIG['training_params_range'][k][1])