diff --git a/README.md b/README.md
index 97990ad..aa313e7 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ def get_model(param):
     return model
 ```
 
-The optimization process is easily trackable using the callbacks provided by Keras. At the end of the procedure, you can access to all you need querying the keras-hypetune searcher. The best solutions can be automatically saved in proper locations.
+The optimization process is easily trackable using the callbacks provided by Keras. At the end of the search, you can access everything you need by querying the keras-hypetune searcher. The best solutions can be automatically saved in proper locations.
 
 ## Installation
 
diff --git a/kerashypetune/kerashypetune.py b/kerashypetune/kerashypetune.py
index 81293e1..40ebc46 100644
--- a/kerashypetune/kerashypetune.py
+++ b/kerashypetune/kerashypetune.py
@@ -1,222 +1,127 @@
 import random
 import numpy as np
-from itertools import product
+from copy import deepcopy
 
-from .utils import (ParameterSampler, _check_param, _safeformat_str, _get_callback_paths,
-                    _clear_callbacks, _create_fold, _check_data)
+from .utils import (ParameterSampler, _check_param, _check_data,
+                    _clear_callbacks, _create_fold)
 
 
+class _KerasSearch:
+    """Base class for KerasSearch meta-estimator.
 
-class KerasGridSearch(object):
-
+    Warning: This class should not be used directly. Use derived classes
+    instead.
     """
-    Grid hyperparamater searching and optimization on a fixed validation set.
-
-    Pass a Keras model (in Sequential or Functional format), and
-    a dictionary with the parameter boundaries for the experiment.
-    For searching, takes in the same arguments available in Keras model.fit(...).
-    All the input format supported by Keras model are accepted.
-
-
-    Parameters
-    ----------
-    hypermodel : function
-        A callable that takes parameters in dict format and returns a TF Model instance.
-    param_grid : dict
-        Hyperparameters to try, 1-to-1 mapped with the parameters dict keys present
-        in the hypermodel function.
-    monitor : str, default val_loss
-        Quantity to monitor in order to detect the best model.
-    greater_is_better : bool, default False
-        Whether the quantity to monitor is a score function, meaning high is good,
-        or a loss function (as default), meaning low is good.
-    store_model : bool, default True
-        If True the best model is stored inside the KerasGridSearch object.
-    savepath : str, default None
-        String or path-like, path to save the best model file. If None, no saving is applied.
-    tuner_verbose : int, default 1
-        0 or 1. Verbosity mode. 0 = silent, 1 = print trial logs with the connected score.
-
-
-    Attributes
-    ----------
-    trials : list
-        A list of dicts. The dicts are all the hyperparameter combinations tried and
-        derived from the param_grid
-    scores : list
-        The monitor quantities achived on the validation data by all the models tried.
-    best_params : dict, default None
-        The dict containing the best combination (in term of score) of hyperparameters.
-    best_score : float, default None
-        The best score achieved by all the possible combination created.
-    best_model : TF Model, default None
-        The best model (in term of score). Accessible only if store_model is set to True.
-
-
-    Notes
-    ----------
-    KerasGridSearch allows the usage of every callbacks available in Keras (also the
-    custom one). The callbacks, that provide the possibility to save any output as
-    external files, support naming formatting options. This is true for ModelCheckpoint,
-    CSVLogger, TensorBoard and RemoteMonitor.
-    'trial' is the custom token that can be used
-    to personalize the name formatting.
-
-    For example: if filepath in ModelCheckpoint is model_{trial}.hdf5, then the model
-    checkpoints will be saved with the relative number of trial in the filename.
-    This enables to save and differentiate each model created in the searching trials.
-    """
-
     def __init__(self,
                  hypermodel,
                  param_grid,
+                 n_iter=None,
+                 sampling_seed=None,
                  monitor='val_loss',
                  greater_is_better=False,
                  store_model=True,
                  savepath=None,
                  tuner_verbose=1):
-
+
         self.hypermodel = hypermodel
         self.param_grid = param_grid
+        self.n_iter = n_iter
+        self.sampling_seed = sampling_seed
         self.monitor = monitor
         self.greater_is_better = greater_is_better
         self.store_model = store_model
         self.savepath = savepath
         self.tuner_verbose = tuner_verbose
+
+    def __repr__(self):
+        return "<kerashypetune.{}>".format(self.__class__.__name__)
+
+    def __str__(self):
+        return "<kerashypetune.{}>".format(self.__class__.__name__)
+
+    def _search(self,
+                x, y=None,
+                validation_data=None,
+                validation_split=0.0,
+                is_random=False,
+                id_fold=None,
+                **fitargs):
+        """Private method to perform a search on a fixed validation set for
+        the best parameters configuration."""
+
         self.trials = []
         self.scores = []
-        self.best_params = None
-        self.best_score = None
-        self.best_model = None
-
-
-    def set_seed(self,
-                 seed_fun,
-                 **seedargs):
-
-        """
-        Pass a function to set the seed in every trial: optional.
-
-        Parameters
-        ----------
-        seed_fun : callable, default None
-            Function used to set the seed in each trial.
-        seedargs : Additional arguments of seed_fun.
-
-        Examples
-        --------
-        >>> def seed_setter(seed):
-        >>>     tf.random.set_seed(seed)
-        >>>     os.environ['PYTHONHASHSEED'] = str(seed)
-        >>>     np.random.seed(seed)
-        >>>     random.seed(seed)
-        >>>
-        >>> kgs = KerasGridSearch(...)
-        >>> kgs.set_seed(seed_setter, seed=1234)
-        >>> kgs.search(...)
-        """
-
-        if not callable(seed_fun):
-            raise ValueError("seed_fun must be a callable function")
-
-        self.seed_fun = seed_fun
-        self.seedargs = seedargs
-
-
-    def search(self,
-               x,
-               y = None,
-               validation_data = None,
-               validation_split = 0.0,
-               **fitargs):
-
-        """
-        Performs a search for best hyperparameter configurations creating
-        all the possible trials and evaluating on the validation set provided.
-
-        Parameters
-        ----------
-        x : multi types
-            Input data. All the input format supported by Keras model are accepted.
-        y : multi types, default None
-            Target data. All the target format supported by Keras model are accepted.
-        validation_data : multi types, default None
-            Data on which to evaluate the loss and any model metrics at the end of each epoch.
-            All the validation_data format supported by Keras model are accepted.
-        validation_split : float, default 0.0
-            Float between 0 and 1. Fraction of the training data to be used as validation data.
-        **fitargs : Additional fitting arguments, the same accepted in Keras model.fit(...).
- """ - - # retrive utility params from CV process (if applied) - fold = self._fold if hasattr(self, '_fold') else '' - callback_paths = (self._callback_paths if hasattr(self, '_callback_paths') - else '') - + if validation_data is None and validation_split == 0.0: - raise ValueError("Pass at least one of validation_data or validation_split") - + raise ValueError( + "Pass at least one of validation_data or validation_split.") + if not isinstance(self.param_grid, dict): - raise ValueError("Pass param_grid in dict format") - self.param_grid = self.param_grid.copy() - - tunable_fitargs = ['batch_size', 'epochs', 'steps_per_epoch', 'class_weight'] - - if 'callbacks' in fitargs.keys() and fold == '': - callback_paths = _get_callback_paths(fitargs['callbacks']) - - for p_k, p_v in self.param_grid.items(): - self.param_grid[p_k] = _check_param(p_v) - - start_score = -np.inf if self.greater_is_better else np.inf - self.best_score = start_score + raise ValueError("Pass param_grid in dict format.") + self._param_grid = self.param_grid.copy() + + tunable_fitargs = ['batch_size', 'epochs', + 'steps_per_epoch', 'class_weight'] + + for p_k, p_v in self._param_grid.items(): + self._param_grid[p_k] = _check_param(p_v) eval_epoch = np.argmax if self.greater_is_better else np.argmin eval_score = np.max if self.greater_is_better else np.min - - total_trials = np.prod([len(p) for p in self.param_grid.values()]) - verbose = fitargs['verbose'] if 'verbose' in fitargs.keys() else 0 - - if self.tuner_verbose == 1: - print(f"\n{total_trials} trials detected for {tuple(self.param_grid.keys())}") - - for trial,param in enumerate(product(*self.param_grid.values())): - - if hasattr(self, 'seed_fun'): - self.seed_fun(**self.seedargs) - - if 'callbacks' in fitargs.keys(): - fitargs['callbacks'] = _clear_callbacks(fitargs['callbacks'], - callback_paths, - trial+1, fold, - start_score) - - param = dict(zip(self.param_grid.keys(), param)) + start_score = -np.inf if self.greater_is_better else np.inf + self.best_score = start_score + + rs = ParameterSampler(n_iter=self.n_iter, + param_distributions=self._param_grid, + random_state=self.sampling_seed, + is_random=is_random) + self._param_combi = rs.sample() + + if 'callbacks' in fitargs: + if isinstance(fitargs['callbacks'], list): + _callbacks = deepcopy(fitargs['callbacks']) + else: + _callbacks = deepcopy([fitargs['callbacks']]) + + if self.tuner_verbose > 0: + print("\n{} trials detected for {}".format( + len(self._param_combi), tuple(self._param_grid.keys()))) + verbose = fitargs['verbose'] if 'verbose' in fitargs else 0 + else: + verbose = 0 + fitargs['verbose'] = verbose + + for trial, param in enumerate(self._param_combi): + + if 'callbacks' in fitargs: + fitargs['callbacks'] = _clear_callbacks( + deepcopy(_callbacks), trial + 1, id_fold) + + param = dict(zip(self._param_grid.keys(), param)) model = self.hypermodel(param) - - fit_param = {k:v for k,v in param.items() if k in tunable_fitargs} + + fit_param = {k: v for k, v in param.items() if k in tunable_fitargs} all_fitargs = dict(list(fitargs.items()) + list(fit_param.items())) - - if self.tuner_verbose == 1: - print(f"\n***** ({trial+1}/{total_trials}) *****\nSearch({param})") - else: - verbose = 0 - all_fitargs['verbose'] = verbose - - model.fit(x = x, - y = y, - validation_split = validation_split, - validation_data = validation_data, + + if self.tuner_verbose > 0: + print("\n***** ({}/{}) *****\nSearch({})".format( + trial + 1, len(self._param_combi), param)) + + model.fit(x=x, + y=y, + 
validation_split=validation_split,
+                      validation_data=validation_data,
                       **all_fitargs)
-
+
             epoch = eval_epoch(model.history.history[self.monitor])
-            param['epochs'] = epoch+1
+            param['epochs'] = epoch + 1
             param['steps_per_epoch'] = model.history.params['steps']
-            param['batch_size'] = (all_fitargs['batch_size'] if 'batch_size'
-                                   in all_fitargs.keys() else None)
-            score = np.round(model.history.history[self.monitor][epoch],5)
+            param['batch_size'] = (all_fitargs['batch_size'] if 'batch_size'
+                                   in all_fitargs else None)
+            score = round(model.history.history[self.monitor][epoch], 5)
             evaluate = eval_score([self.best_score, score])
-
+
             if self.best_score != evaluate:
 
@@ -225,356 +130,528 @@
                     self.best_model = model
 
                 if self.savepath is not None:
-                    model.save(self.savepath.format(fold=fold))
-
+                    if id_fold is not None:
+                        model.save(self.savepath.replace('{fold}', str(id_fold)))
+                    else:
+                        model.save(self.savepath)
+
                 self.best_score = evaluate
 
             self.trials.append(param)
             self.scores.append(score)
-
-            if self.tuner_verbose == 1:
-                print(f"SCORE: {score} at epoch {epoch+1}")
 
+            if self.tuner_verbose > 0:
+                print("SCORE: {} at epoch {}".format(score, epoch + 1))
+
+        return self
+
+
+class KerasGridSearch(_KerasSearch):
+    """Grid hyperparameter searching and optimization on a fixed
+    validation set.
+
+    Pass a Keras model (in Sequential or Functional format), and
+    a dictionary with the parameter boundaries for the experiment.
+    For searching, takes in the same arguments available in Keras
+    model.fit(...). All the input formats supported by Keras models
+    are accepted.
+
+    Parameters
+    ----------
+    hypermodel : callable
+        A callable that takes parameters in dict format and returns a
+        TF Model instance.
+
+    param_grid : dict
+        Hyperparameters to try, 1-to-1 mapped with the parameters dict
+        keys present in the hypermodel function.
+
+    monitor : str, default='val_loss'
+        Quantity to monitor in order to detect the best model.
+
+    greater_is_better : bool, default=False
+        Whether the quantity to monitor is a score function, meaning high
+        is good, or a loss function (as default), meaning low is good.
+
+    store_model : bool, default=True
+        If True the best model is stored in the KerasGridSearch object.
+
+    savepath : str, default=None
+        String or path-like, path to save the best model file.
+        If None, no saving is applied.
+
+    tuner_verbose : int, default=1
+        Verbosity mode. <=0 silences all output; >0 prints trial logs
+        with the associated score.
 
-class KerasRandomSearch(object):
-
+    Attributes
+    ----------
+    trials : list
+        A list of dicts. The dicts are all the hyperparameter combinations
+        tried and derived from the param_grid.
+
+    scores : list
+        The monitor quantities achieved on the validation data by all the
+        models tried.
+
+    best_params : dict
+        The dict containing the best combination (in terms of score) of
+        hyperparameters.
+
+    best_score : float
+        The best score achieved by all the possible combinations created.
+
+    best_model : TF Model
+        The best model (in terms of score). Accessible only if store_model
+        is set to True.
+
+    Notes
+    ----------
+    KerasGridSearch allows the usage of every callback available in Keras
+    (including custom ones). Callbacks that can save outputs to external
+    files support name formatting options. This is true for ModelCheckpoint,
+    CSVLogger, TensorBoard and RemoteMonitor.
+    'trial' is the custom token that can be used to personalize name formatting.
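As a minimal, hypothetical sketch of the token in use (`get_model`, `param_grid` and the data arrays are assumed from the README example; `ModelCheckpoint` and `KerasGridSearch` are the real APIs):

```python
from tensorflow.keras.callbacks import ModelCheckpoint
from kerashypetune import KerasGridSearch

# '{trial}' is replaced by keras-hypetune with the trial number before
# each fit: trial 1 saves to model_1.hdf5, trial 2 to model_2.hdf5, ...
checkpoint = ModelCheckpoint('model_{trial}.hdf5', save_best_only=True)

kgs = KerasGridSearch(get_model, param_grid,
                      monitor='val_loss', greater_is_better=False)
kgs.search(x_train, y_train,
           validation_data=(x_val, y_val),
           callbacks=[checkpoint])

print(kgs.best_score, kgs.best_params)  # summary of the best trial
```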
+
+    For example: if filepath in ModelCheckpoint is model_{trial}.hdf5, then
+    the model checkpoints will be saved with the relative number of trial in
+    the filename. This enables to save and differentiate each model created in
+    the searching trials.
     """
-    Random hyperparamater searching and optimization on a fixed validation set.
-
-    Pass a Keras model (in Sequential or Functional format), and
+
+    def __init__(self,
+                 hypermodel,
+                 param_grid,
+                 monitor='val_loss',
+                 greater_is_better=False,
+                 store_model=True,
+                 savepath=None,
+                 tuner_verbose=1):
+        self.hypermodel = hypermodel
+        self.param_grid = param_grid
+        self.monitor = monitor
+        self.greater_is_better = greater_is_better
+        self.store_model = store_model
+        self.savepath = savepath
+        self.tuner_verbose = tuner_verbose
+        self.n_iter = None
+        self.sampling_seed = None
+
+    def search(self,
+               x, y=None,
+               validation_data=None,
+               validation_split=0.0,
+               **fitargs):
+        """Performs a search for best hyperparameter configurations creating
+        all the possible trials and evaluating on the validation set provided.
+
+        Parameters
+        ----------
+        x : multi types
+            Input data. All the input formats supported by Keras model are
+            accepted.
+
+        y : multi types, default=None
+            Target data. All the target formats supported by Keras model are
+            accepted.
+
+        validation_data : multi types, default=None
+            Data on which to evaluate the loss and any model metrics at the
+            end of each epoch. All the validation_data formats supported by
+            Keras model are accepted.
+
+        validation_split : float, default=0.0
+            Float between 0 and 1. Fraction of the training data to be used as
+            validation data.
+
+        **fitargs : Additional fitting arguments, the same accepted in Keras
+            model.fit(...).
+
+        Returns
+        -------
+        self : object
+        """
+
+        self._search(x=x, y=y,
+                     validation_data=validation_data,
+                     validation_split=validation_split,
+                     is_random=False,
+                     **fitargs)
+
+        return self
+
+
+class KerasRandomSearch(_KerasSearch):
+    """Random hyperparameter searching and optimization on a fixed
+    validation set.
+
+    Pass a Keras model (in Sequential or Functional format), and
     a dictionary with the parameter boundaries for the experiment.
-
-    In contrast to grid-search, not all parameter values are tried out,
-    but rather a fixed number of parameter settings is sampled from
-    the specified distributions. The number of parameter settings that
-    are tried is given by n_iter.
+    For searching, takes in the same arguments available in Keras
+    model.fit(...). All the input formats supported by Keras models
+    are accepted.
 
-    If all parameters are presented as a list, sampling without replacement
-    is performed. If at least one parameter is given as a distribution
-    (random variable from scipy.stats.distribution), sampling with replacement
-    is used. It is highly recommended to use continuous distributions
-    for continuous parameters.
+    In contrast to grid-search, not all parameter values are tried out,
+    but rather a fixed number of parameter settings is sampled from
+    the specified distributions. The number of parameter settings that
+    are tried is given by n_iter.
+    If all parameters are presented as a list/floats/integers, sampling
+    without replacement is performed. If at least one parameter is given
+    as a distribution (random variable from scipy.stats.distribution),
+    sampling with replacement is used. It is highly recommended to use
+    continuous distributions for continuous parameters.
 
-    For searching, takes in the same arguments available in Keras model.fit(...).
-    All the input format supported by Keras model are accepted.
-
-
     Parameters
     ----------
-    hypermodel : function
-        A callable that takes parameters in dict format and returns a TF Model instance.
+    hypermodel : callable
+        A callable that takes parameters in dict format and returns a
+        TF Model instance.
+
     param_grid : dict
-        Hyperparameters to try, 1-to-1 mapped with the parameters dict keys present
-        in the hypermodel function.
+        Hyperparameters to try, 1-to-1 mapped with the parameters dict
+        keys present in the hypermodel function.
+
     n_iter : int
-        Number of parameter settings that are sampled.
+        Number of parameter settings that are sampled.
         n_iter trades off runtime vs quality of the solution.
-    sampling_seed : int, default 0
+
+    sampling_seed : int, default=0
        The seed used to sample from the hyperparameter distributions.
-    monitor : str, default val_loss
+
+    monitor : str, default='val_loss'
        Quantity to monitor in order to detect the best model.
-    greater_is_better : bool, default False
-        Whether the quantity to monitor is a score function, meaning high is good,
-        or a loss function (as default), meaning low is good.
-    store_model : bool, default True
-        If True the best model is stored inside the KerasRandomSearch object.
-    savepath : str, default None
-        String or path-like, path to save the best model file. If None, no saving is applied.
-    tuner_verbose : int, default 1
-        0 or 1. Verbosity mode. 0 = silent, 1 = print trial logs with the connected score.
-
-
+
+    greater_is_better : bool, default=False
+        Whether the quantity to monitor is a score function, meaning high
+        is good, or a loss function (as default), meaning low is good.
+
+    store_model : bool, default=True
+        If True the best model is stored in the KerasRandomSearch object.
+
+    savepath : str, default=None
+        String or path-like, path to save the best model file.
+        If None, no saving is applied.
+
+    tuner_verbose : int, default=1
+        Verbosity mode. <=0 silences all output; >0 prints trial logs
+        with the associated score.
+
     Attributes
     ----------
     trials : list
-        A list of dicts. The dicts are all the hyperparameter combinations tried and
-        derived from the param_grid
-    scores : list
-        The monitor quantities achived on the validation data by all the models tried.
-    best_params : dict, default None
-        The dict containing the best combination (in term of score) of hyperparameters.
-    best_score : float, default None
+        A list of dicts. The dicts are all the hyperparameter combinations
+        tried and derived from the param_grid.
+
+    scores : list
+        The monitor quantities achieved on the validation data by all the
+        models tried.
+
+    best_params : dict
+        The dict containing the best combination (in terms of score) of
+        hyperparameters.
+
+    best_score : float
        The best score achieved by all the possible combinations created.
-    best_model : TF Model, default None
-        The best model (in term of score). Accessible only if store_model is set to True.
-
-
+
+    best_model : TF Model
+        The best model (in terms of score). Accessible only if store_model
+        is set to True.
+
     Notes
     ----------
-    KerasRandomSearch allows the usage of every callbacks available in Keras (also the
-    custom one). The callbacks, that provide the possibility to save any output as
-    external files, support naming formatting options. This is true for ModelCheckpoint,
-    CSVLogger, TensorBoard and RemoteMonitor. 'trial' is the custom token that can be used
-    to personalize the name formatting.
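To make the distribution option above concrete, a hedged sketch of a mixed param_grid (the parameter names are placeholders; scipy.stats is a real library):

```python
from scipy import stats

# lists are sampled without replacement; objects exposing .rvs (like
# scipy.stats distributions) are sampled with replacement
param_grid = {
    'unit_1': [128, 64, 32],             # discrete choices
    'lr': stats.loguniform(1e-4, 1e-1),  # continuous distribution
    'epochs': 100,                       # fixed single value
    'batch_size': 512
}
```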
-
-    For example: if filepath in ModelCheckpoint is model_{trial}.hdf5, then the model
-    checkpoints will be saved with the relative number of trial in the filename.
-    This enables to save and differentiate each model created in the searching trials.
+    KerasRandomSearch allows the usage of every callback available in Keras
+    (including custom ones). Callbacks that can save outputs to external
+    files support name formatting options. This is true for ModelCheckpoint,
+    CSVLogger, TensorBoard and RemoteMonitor.
+    'trial' is the custom token that can be used to personalize name formatting.
+
+    For example: if filepath in ModelCheckpoint is model_{trial}.hdf5, then
+    the model checkpoints will be saved with the relative number of trial in
+    the filename. This enables to save and differentiate each model created in
+    the searching trials.
     """
-
+
     def __init__(self,
                  hypermodel,
                  param_grid,
                  n_iter,
-                 sampling_seed = 0,
-                 monitor ='val_loss',
-                 greater_is_better = False,
-                 store_model = True,
-                 savepath = None,
-                 tuner_verbose = 1):
-
+                 sampling_seed=0,
+                 monitor='val_loss',
+                 greater_is_better=False,
+                 store_model=True,
+                 savepath=None,
+                 tuner_verbose=1):
         self.hypermodel = hypermodel
         self.param_grid = param_grid
         self.n_iter = n_iter
-        self.sampling_seed = sampling_seed
+        self.sampling_seed = sampling_seed
         self.monitor = monitor
         self.greater_is_better = greater_is_better
         self.store_model = store_model
         self.savepath = savepath
         self.tuner_verbose = tuner_verbose
-        self.trials = []
-        self.scores = []
-        self.best_params = None
-        self.best_score = None
-        self.best_model = None
-
-
-    def set_seed(self,
-                 seed_fun,
-                 **seedargs):
-
-        """
-        Pass a function to set the seed in every trial: optional.
-
-        Parameters
-        ----------
-        seed_fun : callable, default None
-            Function used to set the seed in each trial.
-        seedargs : Additional arguments of seed_fun.
-
-        Examples
-        --------
-        >>> def seed_setter(seed):
-        >>>     tf.random.set_seed(seed)
-        >>>     os.environ['PYTHONHASHSEED'] = str(seed)
-        >>>     np.random.seed(seed)
-        >>>     random.seed(seed)
-        >>>
-        >>> kgs = KerasRandomSearch(...)
-        >>> kgs.set_seed(seed_setter, seed=1234)
-        >>> kgs.search(...)
-        """
-
-        if not callable(seed_fun):
-            raise ValueError("seed_fun must be a callable function")
-
-        self.seed_fun = seed_fun
-        self.seedargs = seedargs
-
-
-    def search(self,
-               x,
-               y = None,
-               validation_data = None,
-               validation_split = 0.0,
+
+    def search(self,
+               x, y=None,
+               validation_data=None,
+               validation_split=0.0,
                **fitargs):
-
-        """
-        Performs a search for best hyperparameter configurations creating
+        """Performs a search for best hyperparameter configurations creating
         all the possible trials and evaluating on the validation set provided.
-
+
         Parameters
-        ----------
+        ----------
         x : multi types
-            Input data. All the input format supported by Keras model are accepted.
-        y : multi types, default None
-            Target data. All the target format supported by Keras model are accepted.
-        validation_data : multi types, default None
-            Data on which to evaluate the loss and any model metrics at the end of each epoch.
-            All the validation_data format supported by Keras model are accepted.
-        validation_split : float, default 0.0
-            Float between 0 and 1. Fraction of the training data to be used as validation data.
-        **fitargs : Additional fitting arguments, the same accepted in Keras model.fit(...).
- """ - - # retrive utility params from CV process (if applied) - fold = self._fold if hasattr(self, '_fold') else '' - callback_paths = (self._callback_paths if hasattr(self, '_callback_paths') - else '') - - if validation_data is None and validation_split == 0.0: - raise ValueError("Pass at least one of validation_data or validation_split") - - if not isinstance(self.param_grid, dict): - raise ValueError("Pass param_grid in dict format") - self.param_grid = self.param_grid.copy() - - tunable_fitargs = ['batch_size', 'epochs', 'steps_per_epoch', 'class_weight'] - - if 'callbacks' in fitargs.keys() and fold == '': - callback_paths = _get_callback_paths(fitargs['callbacks']) - - start_score = -np.inf if self.greater_is_better else np.inf - self.best_score = start_score + Input data. All the input formats supported by Keras model are + accepted. - eval_epoch = np.argmax if self.greater_is_better else np.argmin - eval_score = np.max if self.greater_is_better else np.min - - verbose = fitargs['verbose'] if 'verbose' in fitargs.keys() else 0 - - rs = ParameterSampler(n_iter = self.n_iter, - param_distributions = self.param_grid, - random_state = self.sampling_seed) - sampled_params = rs.sample() - - if self.tuner_verbose == 1: - print(f"\n{self.n_iter} trials detected for {tuple(self.param_grid.keys())}") - - for trial,param in enumerate(sampled_params): - - if hasattr(self, 'seed_fun'): - self.seed_fun(**self.seedargs) - - if 'callbacks' in fitargs.keys(): - fitargs['callbacks'] = _clear_callbacks(fitargs['callbacks'], - callback_paths, - trial+1, fold, - start_score) - - param = dict(zip(self.param_grid.keys(), param)) - model = self.hypermodel(param) - - fit_param = {k:v for k,v in param.items() if k in tunable_fitargs} - all_fitargs = dict(list(fitargs.items()) + list(fit_param.items())) - - if self.tuner_verbose == 1: - print(f"\n***** ({trial+1}/{self.n_iter}) *****\nSearch({param})") - else: - verbose = 0 - all_fitargs['verbose'] = verbose - - model.fit(x = x, - y = y, - validation_split = validation_split, - validation_data = validation_data, - **all_fitargs) - - epoch = eval_epoch(model.history.history[self.monitor]) - param['epochs'] = epoch+1 - param['steps_per_epoch'] = model.history.params['steps'] - param['batch_size'] = (all_fitargs['batch_size'] if 'batch_size' - in all_fitargs.keys() else None) - score = np.round(model.history.history[self.monitor][epoch],5) - evaluate = eval_score([self.best_score, score]) - - if self.best_score != evaluate: + y : multi types, default=None + Target data. All the target formats supported by Keras model are + accepted. - self.best_params = param + validation_data : multi types, default=None + Data on which evaluate the loss and any model metrics at the end of + each epoch. All the validation_data formats supported by Keras model + are accepted. - if self.store_model: - self.best_model = model + validation_split : float, default=0.0 + Float between 0 and 1. Fraction of the training data to be used as + validation data. - if self.savepath is not None: - model.save(self.savepath.format(fold=fold)) - - self.best_score = evaluate - self.trials.append(param) - self.scores.append(score) - - if self.tuner_verbose == 1: - print(f"SCORE: {score} at epoch {epoch+1}") - + **fitargs : Additional fitting arguments, the same accepted in Keras + model.fit(...). 
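A minimal usage sketch of the random search API documented above; get_model, the param_grid with distributions, and the data arrays are assumed from the earlier examples:

```python
from kerashypetune import KerasRandomSearch

krs = KerasRandomSearch(get_model, param_grid,
                        n_iter=15, sampling_seed=33,
                        monitor='val_loss', greater_is_better=False)
krs.search(x_train, y_train, validation_data=(x_val, y_val))

print(len(krs.trials))  # 15 sampled configurations were evaluated
```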
+
+        Returns
+        -------
+        self : object
+        """
+
+        self._search(x=x, y=y,
+                     validation_data=validation_data,
+                     validation_split=validation_split,
+                     is_random=True,
+                     **fitargs)
+
+        return self
 
 
-class KerasGridSearchCV(object):
-
+class _KerasSearchCV:
+    """Base class for KerasSearchCV meta-estimator.
+
+    Warning: This class should not be used directly. Use derived classes
+    instead.
     """
-    Grid hyperparamater searching and optimization with cross-validation.
-
-    Pass a Keras model (in Sequential or Functional format), and
+
+    def __init__(self,
+                 hypermodel,
+                 param_grid,
+                 cv,
+                 n_iter=None,
+                 sampling_seed=None,
+                 monitor='val_loss',
+                 greater_is_better=False,
+                 store_model=True,
+                 savepath=None,
+                 tuner_verbose=1):
+
+        self.hypermodel = hypermodel
+        self.param_grid = param_grid
+        self.cv = cv
+        self.n_iter = n_iter
+        self.sampling_seed = sampling_seed
+        self.monitor = monitor
+        self.greater_is_better = greater_is_better
+        self.store_model = store_model
+        self.savepath = savepath
+        self.tuner_verbose = tuner_verbose
+
+    def __repr__(self):
+        return "<kerashypetune.{}>".format(self.__class__.__name__)
+
+    def __str__(self):
+        return "<kerashypetune.{}>".format(self.__class__.__name__)
+
+    def _search(self,
+                x, y=None,
+                sample_weight=None,
+                groups=None,
+                is_random=False,
+                **fitargs):
+        """Private method to perform a CV search for the best parameters
+        configuration."""
+
+        self.folds_trials = {}
+        self.folds_scores = {}
+        self.folds_best_params = {}
+        self.folds_best_score = {}
+        if self.store_model:
+            self.folds_best_models = {}
+
+        if 'validation_split' in fitargs or 'validation_data' in fitargs:
+            raise ValueError(
+                "Validation is automatically created by the cv strategy.")
+
+        if not hasattr(self.cv, 'split'):
+            raise ValueError(
+                "Expected cv as cross-validation object with split method to "
+                "generate indices to split data into training and test set "
+                "(like from sklearn.model_selection).")
+
+        _check_data(x)
+        if y is not None: _check_data(y)
+        if sample_weight is not None: _check_data(sample_weight)
+
+        for fold, (train_id, val_id) in enumerate(self.cv.split(x, y, groups)):
+
+            if self.tuner_verbose > 0:
+                print("\n{}\n{} Fold {} {}\n{}".format(
+                    '#' * 18, '#' * 3, str(fold + 1).zfill(3), '#' * 3, '#' * 18))
+
+            x_train = _create_fold(x, train_id)
+            y_train = None if y is None else _create_fold(y, train_id)
+            sample_weight_train = (_create_fold(sample_weight, train_id)
+                                   if sample_weight is not None else None)
+
+            x_val = _create_fold(x, val_id)
+            y_val = None if y is None else _create_fold(y, val_id)
+            sample_weight_val = (_create_fold(sample_weight, val_id)
+                                 if sample_weight is not None else None)
+
+            ks_fold = _KerasSearch(hypermodel=self.hypermodel,
+                                   param_grid=self.param_grid,
+                                   n_iter=self.n_iter,
+                                   sampling_seed=self.sampling_seed,
+                                   monitor=self.monitor,
+                                   greater_is_better=self.greater_is_better,
+                                   store_model=self.store_model,
+                                   savepath=self.savepath,
+                                   tuner_verbose=self.tuner_verbose)
+
+            ks_fold._search(x=x_train, y=y_train,
+                            sample_weight=sample_weight_train,
+                            validation_data=(x_val, y_val, sample_weight_val),
+                            is_random=is_random,
+                            id_fold=fold + 1,
+                            **fitargs)
+
+            fold_id = "fold {}".format(fold + 1)
+            self.folds_trials[fold_id] = ks_fold.trials
+            self.folds_scores[fold_id] = ks_fold.scores
+            self.folds_best_params[fold_id] = ks_fold.best_params
+            if self.store_model:
+                self.folds_best_models[fold_id] = ks_fold.best_model
+            self.folds_best_score[fold_id] = ks_fold.best_score
+
+        eval_score = np.argmax if self.greater_is_better else np.argmin
+        mean_score_params = np.mean(
+            list(self.folds_scores.values()), axis=0).round(5)
+        evaluate = eval_score(mean_score_params)
+
+        self.best_params = [list(f)[evaluate] for f in self.folds_trials.values()]
+        self.best_params_score = mean_score_params[evaluate]
+
+        return self
+
+
+class KerasGridSearchCV(_KerasSearchCV):
+    """Grid hyperparameter searching and optimization with
+    cross-validation.
+
+    Pass a Keras model (in Sequential or Functional format), and
     a dictionary with the parameter boundaries for the experiment.
-    The cross-validation strategies are the same provided by the
-    scikit-learn cross-validation generator.
-
-    For searching, takes in the same arguments available in Keras model.fit(...).
+    For searching, takes in the same arguments available in Keras
+    model.fit(...). The cross-validation strategies are the same
+    provided by the scikit-learn cross-validation generator.
     Only inputs in array format are supported. In case of multi-input or
-    multi-output is it possible to wrap arrays in list or dictionaries like in
-    Keras.
-
-
+    multi-output models it is possible to wrap arrays in lists or
+    dictionaries like in Keras.
+
     Parameters
     ----------
-    hypermodel : function
-        A callable that takes parameters in dict format and returns a TF Model instance.
+    hypermodel : callable
+        A callable that takes parameters in dict format and returns a
+        TF Model instance.
+
     param_grid : dict
-        Hyperparameters to try, 1-to-1 mapped with the parameters dict keys present
-        in the hypermodel function.
+        Hyperparameters to try, 1-to-1 mapped with the parameters dict
+        keys present in the hypermodel function.
+
     cv : scikit-learn cross-validation generator
-        An sklearn.model_selection splitter class. Used to determine how samples
-        are split up into groups for cross-validation.
-    monitor : str, default val_loss
-        Quantity to monitor in order to detect the best models.
-    greater_is_better : bool, default False
-        Whether the quantity to monitor is a score function, meaning high is good,
-        or a loss function (as default), meaning low is good.
-    store_model : bool, default True
-        If True the best models are stored inside the KerasGridSearchCV object. The best model
-        of each fold is stored.
-    savepath : str, default None
-        String or path-like, path to save the best model files. If None, no saving is applied.
-        savepath can contain named formatting options ('fold' is a special useful key).
-        For example: if filepath is model_{fold}.h5, then the best model of each fold is saved
-        with the number of the relative fold in the name.
-    tuner_verbose : int, default 1
-        0 or 1. Verbosity mode. 0 = silent, 1 = print trial logs with the connected score.
-
-
+        An sklearn.model_selection splitter class. Used to determine
+        how samples are split up into groups for cross-validation.
+
+    monitor : str, default='val_loss'
+        Quantity to monitor in order to detect the best model.
+
+    greater_is_better : bool, default=False
+        Whether the quantity to monitor is a score function, meaning high
+        is good, or a loss function (as default), meaning low is good.
+
+    store_model : bool, default=True
+        If True the best model of each fold is stored inside the
+        KerasGridSearchCV object.
+
+    savepath : str, default=None
+        String or path-like, path to save the best model file.
+        If None, no saving is applied.
+
+    tuner_verbose : int, default=1
+        Verbosity mode. <=0 silences all output; >0 prints trial logs
+        with the associated score.
+
     Attributes
     ----------
     folds_trials : dict
-        A dicts of list. The lists contain all the hyperparameter combinations tried
-        in each fold and derived from the param_grid.
+        A dict of lists. The lists contain all the hyperparameter
+        combinations tried in each fold and derived from the param_grid.
+
     folds_scores : dict
-        A dicts of list. The lists contain the monitor quantities achived on the
-        validation data by all the models tried in each fold.
+        A dict of lists. The lists contain the monitor quantities achieved
+        on the validation data by all the models tried in each fold.
+
     folds_best_params : dict
-        The dict containing the best combination (in term of score) of hyperparameters
-        in each fold.
+        The dict containing the best combination (in terms of score) of
+        hyperparameters in each fold.
+
     folds_best_score : dict
-        The best scores achieved by all the possible combination created in each fold.
+        The best scores achieved by all the possible combinations created
+        in each fold.
+
     folds_best_models : dict
-        The best models (in term of score) in each fold. Accessible only if store_model
-        is set to True.
-    best_params_score : float, default None
+        The best models (in terms of score) in each fold. Accessible only
+        if store_model is set to True.
+
+    best_params_score : float
        The best average score in all the available folds.
-    best_params : dict, default None
-        The paramareter combination related to the best average score
-        in all the available folds.
-
+
+    best_params : dict
+        The parameter combination related to the best average score in all
+        the available folds.
+
     Notes
     ----------
-    KerasGridSearchCV allows the usage of every callbacks available in Keras (also the
-    custom one). The callbacks, that provide the possibility to save any output as
-    external files, support naming formatting options. This is true for ModelCheckpoint,
-    CSVLogger, TensorBoard and RemoteMonitor. 'trial' and 'fold' are custom tokens that
-    can be used to personalize the name formatting.
-
-    For example: if filepath in ModelCheckpoint is model_{fold}_{trial}.hdf5, then
-    the model checkpoints will be saved with the relative number of trial, obtained at
-    a certain fold, in the filename. This enables to save and differentiate each model
-    created in the searching trials.
+    KerasGridSearchCV allows the usage of every callback available in Keras
+    (including custom ones). Callbacks that can save outputs to external
+    files support name formatting options. This is true for ModelCheckpoint,
+    CSVLogger, TensorBoard and RemoteMonitor.
+    'trial' and 'fold' are custom tokens that can be used to personalize the
+    name formatting.
+
+    For example: if filepath in ModelCheckpoint is model_{fold}_{trial}.hdf5,
+    then the model checkpoints will be saved with the relative number of trial,
+    obtained at a certain fold, in the filename. This enables to save and
+    differentiate each model created in the searching trials.
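A hedged sketch of the two tokens in a CV search (kgs_cv stands for an already configured KerasGridSearchCV instance; X and y are assumed numpy arrays):

```python
from tensorflow.keras.callbacks import ModelCheckpoint

# '{fold}' and '{trial}' are filled in by keras-hypetune, so trial 3 of
# fold 2 is saved as model_2_3.hdf5
checkpoint = ModelCheckpoint('model_{fold}_{trial}.hdf5', save_best_only=True)
kgs_cv.search(X, y, callbacks=[checkpoint])
```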
""" - + def __init__(self, hypermodel, param_grid, cv, - monitor = 'val_loss', - greater_is_better = False, - store_model = True, - savepath = None, - tuner_verbose = 1): - + monitor='val_loss', + greater_is_better=False, + store_model=True, + savepath=None, + tuner_verbose=1): self.hypermodel = hypermodel self.param_grid = param_grid self.cv = cv @@ -583,246 +660,174 @@ def __init__(self, self.store_model = store_model self.savepath = savepath self.tuner_verbose = tuner_verbose - self.folds_trials = {} - self.folds_scores = {} - self.folds_best_params = {} - self.folds_best_score = {} - self.folds_best_models = {} - self.best_params_score = None - self.best_params = None - - - def set_seed(self, - seed_fun, - **seedargs): - - """ - Pass a function to set the seed in every trial: optional. - - Parameters - ---------- - seed_fun : callable, default None - Function used to set the seed in each trial. - seedargs : Additional arguments of seed_fun. - - Examples - -------- - >>> def seed_setter(seed): - >>> tf.random.set_seed(seed) - >>> os.environ['PYTHONHASHSEED'] = str(seed) - >>> np.random.seed(seed) - >>> random.seed(seed) - >>> - >>> kgs = KerasGridSearchCV(...) - >>> kgs.set_seed(seed_setter, seed=1234) - >>> kgs.search(...) - """ + self.n_iter = None + self.sampling_seed = None + + def search(self, + x, y=None, + sample_weight=None, + groups=None, + **fitargs): + """Performs a search for best hyperparameter configurations creating + all the possible trials and evaluating on the validation folds + created following the validation strategy. - if not callable(seed_fun): - raise ValueError("seed_fun must be a callable function") - - self.seed_fun = seed_fun - self.seedargs = seedargs - - - def search(self, - x, - y, - sample_weight = None, - groups = None, - **fitargs): - - """ - Performs a search for best hyperparameter configurations creating - all the possible trials and evaluating on the validation folder created - following the validation strategy. - Parameters - ---------- + ---------- x : multi types - Input data. Accepted types are arrays or list/dict in case of multi-input/output. - y : multi types, default None - Target data. Accepted types are arrays or list/dict in case of multi-input/output. - sample_weight : multi types, default None - Optional Numpy array of weights for the training samples, used for weighting - the loss function (during training only). Accepted types are arrays or - list/dict in case of multi-input/output - groups : array-like, default None - Group labels for the samples used while splitting the dataset into train/valid set. - **fitargs : Additional fitting arguments, the same accepted in Keras model.fit(...). - The validation set is automatically created accordingly the cv strategy. + Input data. Accepted types are arrays or list/dict in case of + multi-input/output. + + y : multi types, default=None + Target data. Accepted types are arrays or list/dict in case of + multi-input/output. + + sample_weight : multi types, default=None + Optional Numpy array of weights for the training samples, used + for weighting the loss function (during training only). Accepted + types are arrays or list/dict in case of multi-input/output. + + groups : array-like, default=None + Group labels for the samples used while splitting the dataset into + train/valid set. + + **fitargs : Additional fitting arguments, the same accepted in Keras + model.fit(...). + The validation set is automatically created accordingly to the + cv strategy. 
+
+        Returns
+        -------
+        self : object
+        """
-
-        if 'validation_split' in fitargs.keys() or 'validation_data' in fitargs.keys():
-            raise ValueError("Validation is automatically created by the cv strategy")
-
-        _check_data(x)
-        _check_data(y)
-        if sample_weight is not None: _check_data(sample_weight)
-
-        for fold,(train_id,val_id) in enumerate(self.cv.split(x, y, groups)):
-
-            if self.tuner_verbose == 1:
-                print("\n{}\n{} Fold {} {}\n{}".format(
-                    '#'*18, '#'*3, str(fold+1).zfill(3), '#'*3, '#'*18))
-
-            if 'callbacks' in fitargs.keys() and fold == 0:
-                callback_paths = _get_callback_paths(fitargs['callbacks'])
-
-            x_train = _create_fold(x, train_id)
-            y_train = _create_fold(y, train_id)
-            sample_weight_train = (_create_fold(sample_weight, train_id) if sample_weight
-                                   is not None else None)
-
-            x_val = _create_fold(x, val_id)
-            y_val = _create_fold(y, val_id)
-            sample_weight_val = (_create_fold(sample_weight, val_id) if sample_weight
-                                 is not None else None)
-
-            kgs_fold = KerasGridSearch(hypermodel = self.hypermodel,
-                                       param_grid = self.param_grid,
-                                       monitor = self.monitor,
-                                       greater_is_better = self.greater_is_better,
-                                       store_model = self.store_model,
-                                       savepath = self.savepath,
-                                       tuner_verbose = self.tuner_verbose)
-
-            kgs_fold._fold = fold+1
-            if 'callbacks' in fitargs.keys():
-                kgs_fold._callback_paths = callback_paths
-
-            if hasattr(self, 'seed_fun'):
-                kgs_fold.set_seed(self.seed_fun, **self.seedargs)
-
-            kgs_fold.search(x = x_train,
-                            y = y_train,
-                            sample_weight = sample_weight_train,
-                            validation_data = (x_val, y_val, sample_weight_val),
-                            **fitargs)
-
-            self.folds_trials[f"fold {fold+1}"] = kgs_fold.trials
-            self.folds_scores[f"fold {fold+1}"] = kgs_fold.scores
-            self.folds_best_params[f"fold {fold+1}"] = kgs_fold.best_params
-            if self.store_model:
-                self.folds_best_models[f"fold {fold+1}"] = kgs_fold.best_model
-            self.folds_best_score[f"fold {fold+1}"] = kgs_fold.best_score
-
-        eval_score = np.argmax if self.greater_is_better else np.argmin
-        mean_score_params = np.mean(list(self.folds_scores.values()), axis=0).round(5)
-        evaluate = eval_score(mean_score_params)
-
-        self.best_params = [list(f)[evaluate] for f in self.folds_trials.values()]
-        self.best_params_score = mean_score_params[evaluate]
-
-
-class KerasRandomSearchCV(object):
-
-    """
-    Random hyperparamater searching and optimization with cross-validation.
-
-    Pass a Keras model (in Sequential or Functional format), and
+
+        self._search(x=x, y=y,
+                     sample_weight=sample_weight,
+                     groups=groups,
+                     is_random=False,
+                     **fitargs)
+
+        return self
+
+
+class KerasRandomSearchCV(_KerasSearchCV):
+    """Random hyperparameter searching and optimization with
+    cross-validation.
+
+    Pass a Keras model (in Sequential or Functional format), and
     a dictionary with the parameter boundaries for the experiment.
-    The cross-validation strategies are the same provided by the
-    scikit-learn cross-validation generator.
-
-    In contrast to grid-search, not all parameter values are tried out,
-    but rather a fixed number of parameter settings is sampled from
-    the specified distributions. The number of parameter settings that
+    For searching, takes in the same arguments available in Keras
+    model.fit(...). The cross-validation strategies are the same
+    provided by the scikit-learn cross-validation generator.
+    Only inputs in array format are supported. In case of multi-input or
+    multi-output models it is possible to wrap arrays in lists or
+    dictionaries like in Keras.
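For instance, a multi-input model could be fed like this (a sketch with made-up shapes):

```python
import numpy as np

# two-input model: wrap each input array in a list (or a dict keyed by
# the Input layer names), exactly as Keras expects; the cv splitter then
# slices every array with the same fold indices
X = [np.random.rand(100, 8), np.random.rand(100, 4)]
y = np.random.rand(100, 1)
```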
+ + In contrast to grid-search, not all parameter values are tried out, + but rather a fixed number of parameter settings is sampled from + the specified distributions. The number of parameter settings that are tried is given by n_iter. + If all parameters are presented as a list/floats/integers, sampling + without replacement is performed. If at least one parameter is given + as a distribution (random variable from scipy.stats.distribution), + sampling with replacement is used. It is highly recommended to use + continuous distributions for continuous parameters. - If all parameters are presented as a list, sampling without replacement - is performed. If at least one parameter is given as a distribution - (random variable from scipy.stats.distribution), sampling with replacement - is used. It is highly recommended to use continuous distributions - for continuous parameters. - - For searching, takes in the same arguments available in Keras model.fit(...). - Only input in array format are supported. In case of multi-input or - multi-output is it possible to wrap arrays in list or dictionaries like in - Keras. - - Parameters ---------- - hypermodel : function - A callable that takes parameters in dict format and returns a TF Model instance. + hypermodel : callable + A callable that takes parameters in dict format and returns a + TF Model instance. + param_grid : dict - Hyperparameters to try, 1-to-1 mapped with the parameters dict keys present - in the hypermodel function. + Hyperparameters to try, 1-to-1 mapped with the parameters dict + keys present in the hypermodel function. + cv : scikit-learn cross-validation generator - An sklearn.model_selection splitter class. Used to determine how samples - are split up into groups for cross-validation. + An sklearn.model_selection splitter class. Used to determine + how samples are split up into groups for cross-validation. + n_iter : int - Number of parameter settings that are sampled. + Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution. - sampling_seed : int, default 0 + + sampling_seed : int, default=0 The seed used to sample from the hyperparameter distributions. - monitor : str, default val_loss - Quantity to monitor in order to detect the best models. - greater_is_better : bool, default False - Whether the quantity to monitor is a score function, meaning high is good, - or a loss function (as default), meaning low is good. - store_model : bool, default True - If True the best models are stored inside the KerasRandomSearchCV object. The best model - of each fold is stored. - savepath : str, default None - String or path-like, path to save the best model files. If None, no saving is applied. - savepath can contain named formatting options ('fold' is a special useful key). - For example: if filepath is model_{fold}.h5, then the best model of each fold is saved - with the number of the relative fold in the name. - tuner_verbose : int, default 1 - 0 or 1. Verbosity mode. 0 = silent, 1 = print trial logs with the connected score. - - + + monitor : str, default='val_loss' + Quantity to monitor in order to detect the best model. + + greater_is_better : bool, default=False + Whether the quantity to monitor is a score function, meaning high + is good, or a loss function (as default), meaning low is good. + + store_model : bool, default=True + If True the best model of each fold is stored inside the + KerasRandomSearchCV object. 
+
+    savepath : str, default=None
+        String or path-like, path to save the best model file.
+        If None, no saving is applied.
+
+    tuner_verbose : int, default=1
+        Verbosity mode. <=0 silences all output; >0 prints trial logs
+        with the associated score.
+
     Attributes
     ----------
     folds_trials : dict
-        A dicts of list. The lists contain all the hyperparameter combinations tried
-        in each fold and derived from the param_grid.
+        A dict of lists. The lists contain all the hyperparameter
+        combinations tried in each fold and derived from the param_grid.
+
     folds_scores : dict
-        A dicts of list. The lists contain the monitor quantities achived on the
-        validation data by all the models tried in each fold.
+        A dict of lists. The lists contain the monitor quantities achieved
+        on the validation data by all the models tried in each fold.
+
     folds_best_params : dict
-        The dict containing the best combination (in term of score) of hyperparameters
-        in each fold.
+        The dict containing the best combination (in terms of score) of
+        hyperparameters in each fold.
+
     folds_best_score : dict
-        The best scores achieved by all the possible combination created in each fold.
+        The best scores achieved by all the possible combinations created
+        in each fold.
+
     folds_best_models : dict
-        The best models (in term of score) in each fold. Accessible only if store_model
-        is set to True.
-    best_params_score : float, default None
+        The best models (in terms of score) in each fold. Accessible only
+        if store_model is set to True.
+
+    best_params_score : float
        The best average score in all the available folds.
-    best_params : dict, default None
-        The paramareter combination related to the best average score
-        in all the available folds.
+
+    best_params : dict
+        The parameter combination related to the best average score in all
+        the available folds.
+
     Notes
     ----------
-    KerasRandomSearchCV allows the usage of every callbacks available in keras (also the
-    custom one). The callbacks, that provide the possibility to save any output as
-    external files, support naming formatting options. This is true for ModelCheckpoint,
-    CSVLogger, TensorBoard and RemoteMonitor. 'trial' and 'fold' are custom tokens that
-    can be used to personalize the name formatting.
-
-    For example: if filepath in ModelCheckpoint is model_{fold}_{trial}.hdf5, then
-    the model checkpoints will be saved with the relative number of trial, obtained at
-    a certain fold, in the filename. This enables to save and differentiate each model
-    created in the searching trials.
+    KerasRandomSearchCV allows the usage of every callback available in Keras
+    (including custom ones). Callbacks that can save outputs to external
+    files support name formatting options. This is true for ModelCheckpoint,
+    CSVLogger, TensorBoard and RemoteMonitor.
+    'trial' and 'fold' are custom tokens that can be used to personalize the
+    name formatting.
+
+    For example: if filepath in ModelCheckpoint is model_{fold}_{trial}.hdf5,
+    then the model checkpoints will be saved with the relative number of trial,
+    obtained at a certain fold, in the filename. This enables to save and
+    differentiate each model created in the searching trials.
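A minimal sketch of the random CV search, assuming get_model, a param_grid with distributions, and arrays X, y, groups from the earlier examples:

```python
from sklearn.model_selection import GroupKFold
from kerashypetune import KerasRandomSearchCV

cv = GroupKFold(n_splits=5)

krs_cv = KerasRandomSearchCV(get_model, param_grid, cv=cv,
                             n_iter=15, sampling_seed=33)
krs_cv.search(X, y, groups=groups)  # groups drive the fold assignment

print(krs_cv.best_params_score)
```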
""" - + def __init__(self, hypermodel, param_grid, cv, n_iter, - sampling_seed = 0, - monitor = 'val_loss', - greater_is_better = False, - store_model = True, - savepath = None, - tuner_verbose = 1): - + sampling_seed=0, + monitor='val_loss', + greater_is_better=False, + store_model=True, + savepath=None, + tuner_verbose=1): self.hypermodel = hypermodel self.param_grid = param_grid self.cv = cv @@ -833,135 +838,49 @@ def __init__(self, self.store_model = store_model self.savepath = savepath self.tuner_verbose = tuner_verbose - self.folds_trials = {} - self.folds_scores = {} - self.folds_best_params = {} - self.folds_best_score = {} - self.folds_best_models = {} - self.best_params_score = None - self.best_params = None - - - def set_seed(self, - seed_fun, - **seedargs): - - """ - Pass a function to set the seed in every trial: optional. - - Parameters - ---------- - seed_fun : callable, default None - Function used to set the seed in each trial. - seedargs : Additional arguments of seed_fun. - - Examples - -------- - >>> def seed_setter(seed): - >>> tf.random.set_seed(seed) - >>> os.environ['PYTHONHASHSEED'] = str(seed) - >>> np.random.seed(seed) - >>> random.seed(seed) - >>> - >>> kgs = KerasRandomSearchCV(...) - >>> kgs.set_seed(seed_setter, seed=1234) - >>> kgs.search(...) - """ - if not callable(seed_fun): - raise ValueError("seed_fun must be a callable function") - - self.seed_fun = seed_fun - self.seedargs = seedargs - - - def search(self, - x, - y, - sample_weight = None, - groups = None, - **fitargs): - - """ - Performs a search for best hyperparameter configurations creating - all the possible trials and evaluating on the validation folder created - following the validation strategy. - + def search(self, + x, y=None, + sample_weight=None, + groups=None, + **fitargs): + """Performs a search for best hyperparameter configurations creating + all the possible trials and evaluating on the validation folds + created following the validation strategy. + Parameters - ---------- + ---------- x : multi types - Input data. Accepted types are arrays or list/dict in case of multi-input/output. - y : multi types, default None - Target data. Accepted types are arrays or list/dict in case of multi-input/output. - sample_weight : multi types, default None - Optional Numpy array of weights for the training samples, used for weighting - the loss function (during training only). Accepted types are arrays or - list/dict in case of multi-input/output - groups : array-like, default None - Group labels for the samples used while splitting the dataset into train/valid set. - **fitargs : Additional fitting arguments, the same accepted in Keras model.fit(...). - The validation set is automatically created accordingly the cv strategy. + Input data. Accepted types are arrays or list/dict in case of + multi-input/output. + + y : multi types, default=None + Target data. Accepted types are arrays or list/dict in case of + multi-input/output. + + sample_weight : multi types, default=None + Optional Numpy array of weights for the training samples, used + for weighting the loss function (during training only). Accepted + types are arrays or list/dict in case of multi-input/output. + + groups : array-like, default=None + Group labels for the samples used while splitting the dataset into + train/valid set. + + **fitargs : Additional fitting arguments, the same accepted in Keras + model.fit(...). + The validation set is automatically created accordingly to the + cv strategy. 
+ + Returns + ------- + self : object """ - - if 'validation_split' in fitargs.keys() or 'validation_data' in fitargs.keys(): - raise ValueError("Validation is automatically created by the cv strategy") - - _check_data(x) - _check_data(y) - if sample_weight is not None: _check_data(sample_weight) - - for fold,(train_id,val_id) in enumerate(self.cv.split(x, y, groups)): - - if self.tuner_verbose == 1: - print("\n{}\n{} Fold {} {}\n{}".format( - '#'*18, '#'*3, str(fold+1).zfill(3), '#'*3, '#'*18)) - - if 'callbacks' in fitargs.keys() and fold == 0: - callback_paths = _get_callback_paths(fitargs['callbacks']) - - x_train = _create_fold(x, train_id) - y_train = _create_fold(y, train_id) - sample_weight_train = (_create_fold(sample_weight, train_id) if sample_weight - is not None else None) - - x_val = _create_fold(x, val_id) - y_val = _create_fold(y, val_id) - sample_weight_val = (_create_fold(sample_weight, val_id) if sample_weight - is not None else None) - - kgs_fold = KerasRandomSearch(hypermodel = self.hypermodel, - param_grid = self.param_grid, - n_iter = self.n_iter, - sampling_seed = self.sampling_seed, - monitor = self.monitor, - greater_is_better = self.greater_is_better, - store_model = self.store_model, - savepath = self.savepath, - tuner_verbose = self.tuner_verbose) - - kgs_fold._fold = fold+1 - if 'callbacks' in fitargs.keys(): - kgs_fold._callback_paths = callback_paths - - if hasattr(self, 'seed_fun'): - kgs_fold.set_seed(self.seed_fun, **self.seedargs) - - kgs_fold.search(x = x_train, - y = y_train, - sample_weight = sample_weight_train, - validation_data = (x_val, y_val, sample_weight_val), - **fitargs) - - self.folds_trials[f"fold {fold+1}"] = kgs_fold.trials - self.folds_scores[f"fold {fold+1}"] = kgs_fold.scores - self.folds_best_params[f"fold {fold+1}"] = kgs_fold.best_params - if self.store_model: - self.folds_best_models[f"fold {fold+1}"] = kgs_fold.best_model - self.folds_best_score[f"fold {fold+1}"] = kgs_fold.best_score - - eval_score = np.argmax if self.greater_is_better else np.argmin - mean_score_params = np.mean(list(self.folds_scores.values()), axis=0).round(5) - evaluate = eval_score(mean_score_params) - - self.best_params = [list(f)[evaluate] for f in self.folds_trials.values()] - self.best_params_score = mean_score_params[evaluate] \ No newline at end of file + + self._search(x=x, y=y, + sample_weight=sample_weight, + groups=groups, + is_random=True, + **fitargs) + + return self \ No newline at end of file diff --git a/kerashypetune/utils.py b/kerashypetune/utils.py index 1c40aff..b9eda1e 100644 --- a/kerashypetune/utils.py +++ b/kerashypetune/utils.py @@ -4,16 +4,14 @@ def _check_param(values): - - """ - Check the parameter boundaries passed in dict values. - + """Check the parameter boundaries passed in dict values. + Returns ------- list of checked parameters. """ - if isinstance(values, (list,tuple,np.ndarray)): + if isinstance(values, (list, tuple, np.ndarray)): return list(set(values)) elif hasattr(values, 'rvs'): return values @@ -21,169 +19,80 @@ def _check_param(values): return [values] -def _safeformat_str(str, **kwargs): - - """ - Safe naming formatting for 'trial' and 'fold' token. - - Returns - ------- - string filled correctly. - """ - - class SafeDict(dict): - def __missing__(self, key): - return '{' + key + '}' - - replacements = SafeDict(**kwargs) - - return str.format_map(replacements) - - -def _get_callback_paths(callbacks): - - """ - Extract the saving paths of Keras callbacks that allow the - possibility to create external files. 
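The replacement scheme implemented by the new _clear_callbacks below reduces to plain str.replace on the stored callback paths; a standalone sketch of the effect:

```python
# what keras-hypetune does to a callback path template per fold/trial
path = 'model_{fold}_{trial}.hdf5'
fold, trial = 2, 7

path = path.replace('{fold}', str(fold)).replace('{trial}', str(trial))
print(path)  # -> model_2_7.hdf5
```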
 
 
 def _create_fold(X, ids):
-
-    """
-    Create folds from the data received.
-
+    """Create folds from the data received.
+
     Returns
     -------
-    arrays/list or array/dict of arrays containing fold data.
+    array, list of arrays or dict of arrays containing fold data.
     """
-
+
     if isinstance(X, list):
         return [x[ids] for x in X]
-
+
     elif isinstance(X, dict):
-        return {k:v[ids] for k,v in X.items()}
-
+        return {k: v[ids] for k, v in X.items()}
+
     else:
         return X[ids]
-
+
 
 def _check_data(X):
-
-    """
-    Data controls for cross validation.
-    """
-
+    """Data controls for cross validation."""
+
     if isinstance(X, list):
         for x in X:
             if not isinstance(x, np.ndarray):
                 raise ValueError(
-                    "Received data in list format. If you are dealing with "
-                    "multi-input or multi-output model, take care to cast each "
-                    "element of the list to numpy array. In case of single-input or "
-                    "single-output, list are not supported: cast them to numpy array.")
-
+                    "Received data in list format. Take care to cast each "
+                    "value of the list to numpy array.")
+
     elif isinstance(X, dict):
         for x in X.values():
             if not isinstance(x, np.ndarray):
                 raise ValueError(
                     "Received data in dict format. Take care to cast each "
                     "value of the dict to numpy array.")
-
+
     elif isinstance(X, np.ndarray):
         pass
-
+
     else:
         raise ValueError(
             "Data format not appropriate for Keras CV search. "
             "Supported types are list, dict or numpy array.")
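A short sketch of the fold-slicing and validation helpers above (private helpers of kerashypetune.utils; shapes are arbitrary):

import numpy as np
from kerashypetune.utils import _create_fold, _check_data

X = np.arange(10).reshape(5, 2)
ids = np.array([0, 3])

_create_fold(X, ids)                 # single array: rows 0 and 3
_create_fold([X, X], ids)            # multi-input list: each array sliced
_create_fold({'a': X, 'b': X}, ids)  # dict: each value sliced

_check_data(X)                       # passes silently for numpy arrays
# _check_data([X.tolist()])          # would raise: list items must be numpy arrays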
" "Supported types are list, dict or numpy array.") - - -class ParameterSampler(object): +class ParameterSampler(object): # modified from scikit-learn ParameterSampler - """ - Generator on parameters sampled from given distributions. + """Generator on parameters sampled from given distributions. Non-deterministic iterable over random candidate combinations for hyper- parameter search. If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters. - + Parameters ---------- param_distributions : dict @@ -193,69 +102,75 @@ class ParameterSampler(object): If a list is given, it is sampled uniformly. If a list of dicts is given, first a dict is sampled uniformly, and then a parameter is sampled using that dict as above. - n_iter : integer + + n_iter : integer, default None Number of parameter settings that are produced. + random_state : int, default None Pass an int for reproducible output across multiple function calls. - + + is_random: bool, default True + If it's a random search. + Returns ------- param_combi : list of tuple list of sampled parameter combination """ - def __init__(self, param_distributions, n_iter, random_state=None): - - self.n_iter = n_iter - self.random_state = random_state - self.param_distributions = param_distributions + def __init__(self, param_distributions, n_iter=None, + random_state=None, is_random=False): - def __init__(self, param_distributions, n_iter, random_state=None): - self.n_iter = n_iter self.random_state = random_state self.param_distributions = param_distributions + self.is_random = is_random def sample(self): - - self.param_distributions = self.param_distributions.copy() - - for p_k, p_v in self.param_distributions.items(): - self.param_distributions[p_k] = _check_param(p_v) - - all_lists = all(not hasattr(p, "rvs") - for p in self.param_distributions.values()) - - seed = (random.randint(1, 100) if self.random_state is None - else self.random_state+1) + + param_distributions = self.param_distributions.copy() + + all_lists = all(not hasattr(p, "rvs") + for p in param_distributions.values()) + + seed = (random.randint(1, 100) if self.random_state is None + else self.random_state + 1) random.seed(seed) - + if all_lists: - param_combi = list(product(*self.param_distributions.values())) - grid_size = len(param_combi) + param_combi = list(product(*param_distributions.values())) - if grid_size < self.n_iter: - raise ValueError( - f"The total space of parameters {grid_size} is smaller " - f"than n_iter={self.n_iter}. Try with KerasGridSearch.") - param_combi = random.sample(param_combi, self.n_iter) + if self.is_random: + grid_size = len(param_combi) + if grid_size < self.n_iter: + raise ValueError( + "The total space of parameters {} is smaller " + "than n_iter={}. Try with KerasGridSearch.".format( + grid_size, self.n_iter)) + param_combi = random.sample(param_combi, self.n_iter) else: + + if self.n_iter is None: + raise ValueError( + "n_iter must be an integer >0 when parameter " + "distributions are provided. 
diff --git a/notebooks/Basic Usage GridSearch.ipynb b/notebooks/Basic Usage GridSearch.ipynb
index acec838..5b4bfdb 100644
--- a/notebooks/Basic Usage GridSearch.ipynb
+++ b/notebooks/Basic Usage GridSearch.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -47,7 +47,7 @@
        "((6036, 28, 28), (6036,), (2963, 28, 28), (2963,))"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -70,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,7 +114,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -172,6 +172,16 @@
      "Epoch 00047: early stopping\n",
      "SCORE: 0.93993 at epoch 46\n"
     ]
+   },
+   {
+    "data": {
+     "text/plain": [
+      ""
+     ]
+    },
+    "execution_count": 7,
+    "metadata": {},
+    "output_type": "execute_result"
    }
   ],
   "source": [
@@ -188,7 +198,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -197,7 +207,7 @@
        "[0.95208, 0.95005, 0.9514, 0.95039, 0.95343, 0.94398, 0.9487, 0.93993]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -280,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/setup.py b/setup.py
index efac2f5..caca52c 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 HERE = pathlib.Path(__file__).parent
 
-VERSION = '0.1.1'
+VERSION = '0.1.2'
 PACKAGE_NAME = 'keras-hypetune'
 AUTHOR = 'Marco Cerliani'
 AUTHOR_EMAIL = 'cerlymarco@gmail.com'
@@ -28,5 +28,6 @@
       author_email=AUTHOR_EMAIL,
       url=URL,
       install_requires=INSTALL_REQUIRES,
+      python_requires='>=3',
      packages=find_packages()
       )
\ No newline at end of file