From 329deab1f109a2263fce006e94c9c28e0e9170e6 Mon Sep 17 00:00:00 2001 From: Nick Erickson Date: Tue, 14 Jan 2025 15:03:10 -0800 Subject: [PATCH] Major Refactor: Add save/load to dir, code refactor, etc. (#86) * adding test scripts * matching tabrepo and fit df, using zeroshot_context * plotting functionality * Update * WIP exec.py * Add updates * Add v2 scripts * Remove y_uncleaned * resolve merge conflicts * resolve merge conflicts * resolve merge conflicts * adding test scripts * plotting functionality * Initial Class implementation * typo * minor updates * add run_scripts_v4 * making run_experiment a staticmethod * Updated run_experiments * Cleanup, add TabPFNv2 prototype * Cleanup * Cleanup * Cleanup * Cleanup * Cleanup * bug fix * Add run_tabpfn_v2_benchmark.py + additional bugfixes * Add TabForestPFN_class.py * Add TabForestPFN_class.py * Delete old files * Update file locations * Add AutoGluon_class.py, tabforestpfn_model.py * add hyperparameter/init_args support * Add run_tabforestpfn_benchmark.py * removing unused files * Update add simulation_artifacts support * Add simulation ensemble comparison support via `evaluate_ensemble_with_time` * update * update * minor cleanup * minor cleanup * Update evaluate_ensemble_with_time * Fix bug in zeroshot_configs * Refactor baselines.py * Add repo.evaluate_ensemble_with_time_multi * Update repo.evaluate_ensemble to return DataFrame * Add logger module, and adding wrapper logs to run scripts, will add deeper level logs in next commit * minor update * Refactor evaluate_ensemble * Refactor evaluate_ensemble * Refactor evaluate_ensemble * Cleanup * Cleanup * Cleanup * Add logic to context.py * minor update * Add save/load logic to ZeroshotSimulatorContext * Add save/load logic to EvaluationRepository * Align column names in model fits * Add unit tests for repo save/load * Add extra unit tests for repo save/load * Fix Self import * Fix imports * fix tests * simplify run_quickstart_from_scratch.py * minor update * update `repo.from_raw` * Add root, app and console loggers * addition to logging module * add context save/load with json + relative path support * add ebm and tabpfnv2 models * add ebm and tabpfnv2 models * update * update * update * update * update * Support loading repo artifact from cloned directory * minor fix * cleanup * update * Update * cleanup * Add simple benchmark runner * cleanup * Update for ag12 * Update for ag12 * Update for ag12 * TabPFN support stopped at best epoch * update * update 2025 * update 2025 * update 2025 * update 2025 * update 2025 * update 2025 * update 2025 * update 2025 * update 2025 * Add docstrings for evaluate_ensemble and evaluate_ensembles * Add docstrings, code cleanup * delete old scripts * delete old scripts * update * remove scripts_v6 * remove scripts_v5 * remove context_dl.py * update plot_test_vs_val.py * remove experiment_utils_v6.py * cleanup * cleanup * bug fix * switch from np.bool8 to np.bool_ * Update scripts/baseline_comparison/baselines.py Co-authored-by: David Salinas * Update tabrepo/simulation/ensemble_selection_config_scorer.py Co-authored-by: David Salinas * Update tabrepo/simulation/ensemble_selection_config_scorer.py Co-authored-by: David Salinas * address comment --------- Co-authored-by: Ubuntu Co-authored-by: Ubuntu Co-authored-by: Prateek M Desai Co-authored-by: David Salinas --- README.md | 2 +- examples/run_quickstart.py | 4 +- examples/run_quickstart_from_scratch.py | 54 +--- scripts/baseline_comparison/baselines.py | 207 +++++------- 
scripts/baseline_comparison/evaluate_utils.py | 10 +- .../baseline_comparison/plot_test_vs_val.py | 163 ++++++++++ scripts/snippet.py | 2 +- tabrepo/contexts/context.py | 201 +++++++++--- tabrepo/contexts/context_artificial.py | 5 +- tabrepo/contexts/subcontext.py | 8 +- tabrepo/metrics/_fast_log_loss.py | 4 +- tabrepo/metrics/_roc_auc_cpp/__init__.py | 2 +- tabrepo/metrics/bench_utils.py | 2 +- tabrepo/portfolio/zeroshot_selection.py | 16 +- tabrepo/predictions/tabular_predictions.py | 11 +- tabrepo/repository/abstract_repository.py | 161 ++++++++- tabrepo/repository/ensemble_mixin.py | 305 +++++++++++++++--- tabrepo/repository/evaluation_repository.py | 146 ++++++++- tabrepo/repository/repo_utils.py | 40 +++ tabrepo/repository/time_utils.py | 39 +-- .../ensemble_selection_config_scorer.py | 133 +++++--- tabrepo/simulation/ground_truth.py | 10 +- tabrepo/simulation/simulation_context.py | 134 ++++++-- tabrepo/utils/cache.py | 22 +- tabrepo/utils/parallel_for.py | 9 +- tst/test_cache.py | 3 +- tst/test_metrics.py | 2 +- tst/test_repository.py | 187 +++++++++-- tst/test_repository_utils.py | 7 +- 29 files changed, 1462 insertions(+), 427 deletions(-) create mode 100644 scripts/baseline_comparison/plot_test_vs_val.py create mode 100644 tabrepo/repository/repo_utils.py diff --git a/README.md b/README.md index 48e38ce9..e7873137 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ To evaluate an ensemble of any list of configuration, you can run the following: ```python from tabrepo import load_repository repo = load_repository("D244_F3_C1530_30") -print(repo.evaluate_ensemble(datasets=["Australian"], configs=["CatBoost_r22_BAG_L1", "RandomForest_r12_BAG_L1"])) +print(repo.evaluate_ensemble(dataset="Australian", fold=0, configs=["CatBoost_r22_BAG_L1", "RandomForest_r12_BAG_L1"])) ``` this code will return the error of an ensemble whose weights are computed with the Caruana procedure after loading model diff --git a/examples/run_quickstart.py b/examples/run_quickstart.py index d11e0b2d..03bcb538 100644 --- a/examples/run_quickstart.py +++ b/examples/run_quickstart.py @@ -61,8 +61,8 @@ y_val = repo.labels_val(dataset=dataset, fold=0) print(f"Ground Truth Val (dataset={dataset}, fold=0):\n{y_val[:10]}") - df_ranks, df_ensemble_weights = repo.evaluate_ensemble(datasets=[dataset], configs=configs, ensemble_size=100) - print(f"Ensemble rank per task:\n{df_ranks}") + df_result, df_ensemble_weights = repo.evaluate_ensemble(dataset=dataset, fold=0, configs=configs, ensemble_size=100) + print(f"Ensemble result:\n{df_result}") df_ensemble_weights_mean_sorted = df_ensemble_weights.mean(axis=0).sort_values(ascending=False) print(f"Top 10 highest mean ensemble weight configs:\n{df_ensemble_weights_mean_sorted.head(10)}") diff --git a/examples/run_quickstart_from_scratch.py b/examples/run_quickstart_from_scratch.py index cdb425f0..1109e096 100644 --- a/examples/run_quickstart_from_scratch.py +++ b/examples/run_quickstart_from_scratch.py @@ -1,16 +1,9 @@ import pandas as pd -from autogluon.common.savers import save_pd -from autogluon.common.utils.simulation_utils import convert_simulation_artifacts_to_tabular_predictions_dict from autogluon.tabular import TabularPredictor from autogluon_benchmark import OpenMLTaskWrapper from tabrepo import EvaluationRepository -from tabrepo.repository import EvaluationRepositoryZeroshot -from tabrepo.predictions import TabularPredictionsInMemory -from tabrepo.contexts.context import BenchmarkContext, construct_context -from tabrepo.contexts.subcontext import 
BenchmarkSubcontext -from tabrepo.simulation.ground_truth import GroundTruth def get_artifacts(task: OpenMLTaskWrapper, fold: int, hyperparameters: dict, dataset: str = None, time_limit=60): @@ -125,51 +118,22 @@ def convert_leaderboard_to_configs(leaderboard: pd.DataFrame, minimal: bool = Tr ) ) - # TODO: Move into AutoGluonTaskWrapper - simulation_artifacts_full = dict() - leaderboards = [] - for simulation_artifacts, leaderboard in artifacts: - leaderboards.append(leaderboard) + results_lst_simulation_artifacts = [simulation_artifacts for simulation_artifacts, leaderboard in artifacts] + + leaderboards = [leaderboard for simulation_artifacts, leaderboard in artifacts] leaderboard_full = pd.concat(leaderboards) - print(leaderboard_full) - for simulation_artifacts, leaderboard in artifacts: - for k in simulation_artifacts.keys(): - if k not in simulation_artifacts_full: - simulation_artifacts_full[k] = {} - for f in simulation_artifacts[k]: - if f in simulation_artifacts_full: - raise AssertionError(f"Two results exist for tid {k}, fold {f}!") - else: - simulation_artifacts_full[k][f] = simulation_artifacts[k][f] - - zeroshot_pp, zeroshot_gt = convert_simulation_artifacts_to_tabular_predictions_dict(simulation_artifacts=simulation_artifacts_full) - - save_loc = "./quickstart/" - save_loc_data_dir = save_loc + "model_predictions/" - - predictions = TabularPredictionsInMemory.from_dict(zeroshot_pp) - ground_truth = GroundTruth.from_dict(zeroshot_gt) - predictions.to_data_dir(data_dir=save_loc_data_dir) - ground_truth.to_data_dir(data_dir=save_loc_data_dir) df_configs = convert_leaderboard_to_configs(leaderboard=leaderboard_full) - save_pd.save(path=f"{save_loc}configs.parquet", df=df_configs) + print(df_configs) - context: BenchmarkContext = construct_context( - name="quickstart", - datasets=datasets, - folds=folds, - local_prefix=save_loc, - local_prefix_is_relative=False, - has_baselines=False) - subcontext = BenchmarkSubcontext(parent=context) + repo = EvaluationRepository.from_raw(df_configs=df_configs, results_lst_simulation_artifacts=results_lst_simulation_artifacts) # Note: Can also skip all the above code if you want to use a readily available context rather than generating from scratch: - # from tabrepo.contexts import get_subcontext - # subcontext = get_subcontext(name="D244_F3_C1530_30") + # repo = EvaluationRepository.from_context(version="D244_F3_C1530_30", cache=True) + + repo.print_info() - repo: EvaluationRepository = subcontext.load_from_parent() - repo: EvaluationRepositoryZeroshot = repo.to_zeroshot() + repo = repo.to_zeroshot() results_cv = repo.simulate_zeroshot(num_zeroshot=3, n_splits=2, backend="seq") df_results = repo.generate_output_from_portfolio_cv(portfolio_cv=results_cv, name="quickstart") diff --git a/scripts/baseline_comparison/baselines.py b/scripts/baseline_comparison/baselines.py index a8b865e7..74d76023 100644 --- a/scripts/baseline_comparison/baselines.py +++ b/scripts/baseline_comparison/baselines.py @@ -1,7 +1,6 @@ -import ast import copy import itertools -from typing import List, Optional, Tuple +from typing import List import numpy as np from dataclasses import dataclass @@ -10,11 +9,6 @@ from tabrepo.portfolio.zeroshot_selection import zeroshot_configs from tabrepo.repository import EvaluationRepository -from tabrepo.repository.time_utils import ( - filter_configs_by_runtime, - sort_by_runtime, - get_runtime, -) from tabrepo.utils.parallel_for import parallel_for default_ensemble_size = 40 @@ -34,87 +28,67 @@ class ResultRow: normalized_error: float 
time_train_s: float time_infer_s: float + metric_error_val: float = None config_selected: list = None seed: int = None + metadata: dict = None def evaluate_configs( repo: EvaluationRepository, + configs: List[str], rank_scorer, normalized_scorer, tid: int, + folds: List[int], method: str, - config_selected: List[str], ensemble_size: int = default_ensemble_size, ensemble_kwargs=None, - config_sampled: List[str] = None, - folds: List[int] = range(10), + time_limit: float | None = None, + fit_order="original", seed: int = 0, ) -> List[ResultRow]: """ :param repo: + :param configs: :param rank_scorer: :param normalized_scorer: :param tid: :param method: :param ensemble_size: - :param config_selected: - :param config_sampled: the list of configurations that was seen, to count total runtime. Default to `config_selected` :param folds: :return: list of results for each fold in `folds` evaluated on task `tid` with `config_selected` configurations """ - if not config_sampled: - config_sampled = config_selected if ensemble_size is None: ensemble_size = default_ensemble_size - if ensemble_kwargs is None: - ensemble_kwargs = {} - # Makes results invariant to config order (otherwise tie breaking logic in Caruana selection can make the - # result depend on configuration order) - config_selected = list(sorted(config_selected.copy())) dataset = repo.tid_to_dataset(tid=tid) - metric_errors, ensemble_weights = repo.evaluate_ensemble( - datasets=[dataset], - configs=config_selected, - ensemble_size=ensemble_size, - ensemble_kwargs=ensemble_kwargs, - backend='native', - folds=folds, - rank=False, - ) - # we expect a tensor of results with shape (n_tasks, n_folds) - assert metric_errors.shape == (len(folds),) rows = [] for fold in folds: - task = repo.task_name(dataset=dataset, fold=fold) - metric_error = metric_errors.loc[(dataset, fold)] - config_weights = ensemble_weights.loc[(dataset, fold)] - - # select configurations used in the ensemble as infer time only depends on the models with non-zero weight. 
- config_selected_ensemble = [ - config - for config, weight in zip(config_selected, config_weights) - if weight != 0 - ] - runtimes = get_runtime( - repo=repo, - tid=tid, - fold=fold, - config_names=config_sampled, - runtime_col='time_train_s', - fail_if_missing=False, - ) - latencies = get_runtime( - repo=repo, - tid=tid, + df_metrics, ensemble_weights = repo.evaluate_ensemble( + dataset=dataset, fold=fold, - config_names=config_selected_ensemble, - runtime_col='time_infer_s', - fail_if_missing=False, + configs=configs, + fit_order=fit_order, + seed=seed, + ensemble_size=ensemble_size, + ensemble_kwargs=ensemble_kwargs, + time_limit=time_limit, + rank=False, ) + assert len(df_metrics) == 1 + metrics = df_metrics.iloc[0] + configs_selected = [c for c in configs if c in ensemble_weights.columns] + + task = repo.task_name(dataset=dataset, fold=fold) + + metric_error = metrics["metric_error"] + metric_error_val = metrics["metric_error_val"] + time_train_s = metrics["time_train_s"] + time_infer_s = metrics["time_infer_s"] + rows.append(ResultRow( dataset=dataset, fold=fold, @@ -122,22 +96,31 @@ def evaluate_configs( test_error=metric_error, rank=rank_scorer.rank(task, metric_error), normalized_error=normalized_scorer.rank(task, metric_error), - time_train_s=sum(runtimes.values()), - time_infer_s=sum(latencies.values()), - config_selected=config_sampled, + time_train_s=time_train_s, + time_infer_s=time_infer_s, + metric_error_val=metric_error_val, + config_selected=configs_selected, seed=seed, + metadata=dict( + n_iterations=ensemble_size, + time_limit=time_limit, + ) )) return rows -def framework_name(framework_type, max_runtime=None, ensemble_size=default_ensemble_size, tuned: bool=True) -> str: +def framework_name(framework_type, max_runtime=None, ensemble_size=default_ensemble_size, tuned: bool=True, all: bool = False, prefix: str = None) -> str: method = framework_type if framework_type else "All" + if prefix is None: + prefix = "" + if all: + method = "All" if not tuned: - return method + " (default)" + suffix = " (default)" else: suffix = " (tuned + ensemble)" if ensemble_size > 1 else " (tuned)" suffix += time_suffix(max_runtime=max_runtime) - method += suffix + method = f"{method}{prefix}{suffix}" return method @@ -159,7 +142,7 @@ def evaluate_tid(dataset_name, default, repo, rank_scorer, normalized_scorer): repo=repo, rank_scorer=rank_scorer, normalized_scorer=normalized_scorer, - config_selected=configs, + configs=configs, ensemble_size=ensemble_size, tid=repo.dataset_to_tid(dataset_name), folds=range(n_eval_folds), @@ -180,46 +163,6 @@ def evaluate_tid(dataset_name, default, repo, rank_scorer, normalized_scorer): return [x for l in list_rows for x in l] -def sample_and_pick_best( - repo: EvaluationRepository, tid: int, fold: int, framework_type: Optional[str], n_output: int, - max_runtime: float = None, random_state: int = 0, -) -> Tuple[List[str], List[str]]: - """ - :return: Samples random configurations for the given task until `max_runtime` is exhausted and returns the top `n_output` configurations - based on validation scores. If `framework_type` is specified then only configuration of this framework are considered. - Returns the configurations sampled and the configurations chosen. 
- """ - if n_output is None: - n_output = default_ensemble_size - df_score_val = repo._zeroshot_context.df_configs_ranked - - # gets rows with desired task and framework - mask = (df_score_val['tid'] == tid) & (df_score_val.fold == fold) - if framework_type: - mask &= (df_score_val.framework.str.contains(framework_type)) - df_sub = df_score_val[mask] - - if len(df_sub) == 0: - pass - # assert len(df_sub) > 0, f"missing data {tid} {framework_type}" - # print(f"missing data {tid} {fold} {framework_type}") - - # shuffle the rows - df_sub = df_sub.sample(frac=1, random_state=random_state).reset_index(drop=True) - - # pick only configurations up to max_runtime - if max_runtime: - df_sub = df_sub[df_sub.loc[:, "time_train_s"].cumsum() < max_runtime] - if len(df_sub) == 0: - return [backup_fast_config], [backup_fast_config] - - # pick top `n_output` configurations with the best validation loss - top_config_indices = df_sub["metric_error_val"].argsort().values[:n_output][::-1] - best_configs = df_sub.loc[top_config_indices, "framework"].tolist() - - return df_sub["framework"].tolist(), best_configs - - def framework_best_results( repo: EvaluationRepository, dataset_names: List[str], @@ -227,8 +170,10 @@ def framework_best_results( n_eval_folds: int, rank_scorer, normalized_scorer, + all: bool = False, max_runtimes: float = [3600], ensemble_size: int = default_ensemble_size, + method_prefix: str = None, engine: str = 'ray', random_state: int = 0, **kwargs) -> List[ResultRow]: @@ -237,41 +182,44 @@ def framework_best_results( configurations with highest validation scores among the `n_configs` configurations. """ - def evaluate_tid(dataset_name, max_runtime, framework_type, ensemble_size, repo, rank_scorer, normalized_scorer, random_state): + def evaluate_tid(dataset_name, max_runtime, framework_type, ensemble_size, repo, rank_scorer, normalized_scorer, random_state, all): tid = repo.dataset_to_tid(dataset_name) rows = [] for fold in range(n_eval_folds): - config_sampled, config_selected = sample_and_pick_best( - repo=repo, - n_output=ensemble_size, - tid=tid, - fold=fold, - framework_type=framework_type, - max_runtime=max_runtime, - random_state=random_state, - ) + df_score_val = repo._zeroshot_context.df_configs_ranked + + # gets rows with desired task and framework + mask = (df_score_val['dataset'] == dataset_name) & (df_score_val.fold == fold) + if framework_type: + if isinstance(framework_type, list): + mask &= (df_score_val.framework.str.contains('|'.join(framework_type))) + else: + mask &= (df_score_val.framework.str.contains(framework_type)) + df_sub = df_score_val[mask] + configs = df_sub["framework"].tolist() # evaluate them rows += evaluate_configs( repo=repo, rank_scorer=rank_scorer, normalized_scorer=normalized_scorer, - config_sampled=config_sampled, - config_selected=config_selected, + configs=configs, ensemble_size=ensemble_size, + time_limit=max_runtime, + fit_order="random", + seed=random_state, tid=tid, folds=[fold], - method=framework_name(framework_type, max_runtime, ensemble_size, tuned=True), + method=framework_name(framework_type, max_runtime, ensemble_size, tuned=True, all=all, prefix=method_prefix), ) - rows return rows ensemble_sizes = [1, ensemble_size] list_rows = parallel_for( evaluate_tid, inputs=list(itertools.product(dataset_names, max_runtimes, framework_types, ensemble_sizes)), - context=dict(repo=repo, rank_scorer=rank_scorer, normalized_scorer=normalized_scorer, random_state=random_state), + context=dict(repo=repo, rank_scorer=rank_scorer, 
normalized_scorer=normalized_scorer, random_state=random_state, all=all), engine=engine, ) return [x for l in list_rows for x in l] @@ -387,7 +335,7 @@ def zeroshot_results( seeds: list = [0], method_prefix: str = None, n_ensemble_in_name: bool = False, -) -> List[ResultRow]: +) -> list[ResultRow]: """ :param dataset_names: list of dataset to use when fitting zeroshot :param n_eval_folds: number of folds to consider for evaluation @@ -406,7 +354,7 @@ def zeroshot_results( def evaluate_dataset(test_dataset, n_portfolio, n_ensemble, n_training_dataset, n_training_fold, n_training_config, max_runtime, max_models, max_models_per_type, seed: int, repo: EvaluationRepository, df_rank, rank_scorer, normalized_scorer, - model_frameworks): + model_frameworks) -> list[ResultRow]: method_name = zeroshot_name( n_portfolio=n_portfolio, n_ensemble=n_ensemble, @@ -467,36 +415,33 @@ def evaluate_dataset(test_dataset, n_portfolio, n_ensemble, n_training_dataset, # current fold when filtering by runtime. # portfolio_configs = sort_by_runtime(repo=repo, config_names=portfolio_configs) - portfolio_configs = filter_configs_by_runtime( - repo=repo, - tid=test_tid, - fold=0, - config_names=portfolio_configs, - max_cumruntime=max_runtime if max_runtime else default_runtime, # TODO - ) - if len(portfolio_configs) == 0: - # in case all configurations selected were above the budget, we evaluate a quick backup, we pick a - # configuration that takes <1s to be evaluated - portfolio_configs = [backup_fast_config] + if max_runtime is None: + max_runtime = default_runtime ensemble_kwargs = { "max_models": max_models, "max_models_per_type": max_models_per_type, } - return evaluate_configs( + results_row_lst: list[ResultRow] = evaluate_configs( repo=repo, rank_scorer=rank_scorer, normalized_scorer=normalized_scorer, - config_selected=portfolio_configs, + configs=portfolio_configs, ensemble_size=n_ensemble, ensemble_kwargs=ensemble_kwargs, tid=test_tid, + time_limit=max_runtime, method=method_name, folds=range(n_eval_folds), seed=seed, ) + for results_row in results_row_lst: + results_row.metadata["n_portfolio"] = n_portfolio + + return results_row_lst + dd = repo._zeroshot_context.df_configs_ranked # df_rank = dd.pivot_table(index="framework", columns="dataset", values="score_val").rank() # TODO use normalized scores diff --git a/scripts/baseline_comparison/evaluate_utils.py b/scripts/baseline_comparison/evaluate_utils.py index 55c3fe48..10d83d60 100644 --- a/scripts/baseline_comparison/evaluate_utils.py +++ b/scripts/baseline_comparison/evaluate_utils.py @@ -27,11 +27,15 @@ class Experiment: expname: str # name of the parent experiment used to store the file name: str # name of the specific experiment, e.g. 
"localsearch" - run_fun: Callable[[], List[ResultRow]] # function to execute to obtain results + run_fun: Callable[..., List[ResultRow]] # function to execute to obtain results + kwargs: dict = None - def data(self, ignore_cache: bool = False): + def data(self, ignore_cache: bool = False) -> pd.DataFrame: + kwargs = self.kwargs + if kwargs is None: + kwargs = {} return cache_function_dataframe( - lambda: pd.DataFrame(self.run_fun()), + lambda: pd.DataFrame(self.run_fun(**kwargs)), cache_name=self.name, ignore_cache=ignore_cache, cache_path=output_path.parent / "data" / "results-baseline-comparison" / self.expname, diff --git a/scripts/baseline_comparison/plot_test_vs_val.py b/scripts/baseline_comparison/plot_test_vs_val.py new file mode 100644 index 00000000..4315f81b --- /dev/null +++ b/scripts/baseline_comparison/plot_test_vs_val.py @@ -0,0 +1,163 @@ +import ast +import math +import matplotlib.pyplot as plt +import pandas as pd +from matplotlib.animation import ArtistAnimation + +from tabrepo.repository.abstract_repository import AbstractRepository + + +def plot_test_vs_val(df: pd.DataFrame, repo: AbstractRepository, baselines: list[str] = None): + budget_suffix = f"\(4h\)" + df_selected = df[ + df.method.str.contains(f"Portfolio-N.* \(ensemble\) {budget_suffix}") + ].copy() + + df_expanded_metadata = pd.DataFrame(df_selected["metadata"].map(ast.literal_eval).to_list(), index=df_selected.index) + df_selected = pd.concat([df_selected, df_expanded_metadata], axis=1) + + df_selected = df_selected.sort_values(by="n_portfolio") + df_selected_first = df_selected[df_selected["n_portfolio"] == 1] + df_selected_first["norm_factor"] = df_selected_first["test_error"] + df_selected_first = df_selected_first[["dataset", "fold", "norm_factor"]] + df_selected = df_selected.merge(df_selected_first, on=["dataset", "fold"]) + + df_selected = df_selected[df_selected["norm_factor"] != 0] + df_selected["metric_error_rescaled"] = df_selected["test_error"] / df_selected["norm_factor"] + df_selected["metric_error_val_rescaled"] = df_selected["metric_error_val"] / df_selected["norm_factor"] + + plot_w(df=df_selected) + + task_metadata = repo.task_metadata.copy() + task_metadata = task_metadata[task_metadata["dataset"].isin(df_selected["dataset"].unique())] + task_metadata = task_metadata[["dataset", "NumberOfInstances"]].set_index("dataset") + task_metadata = task_metadata.sort_values(by=["NumberOfInstances"]) + + baselines = ["AutoGluon best (4h)", "Autosklearn2 (4h)"] + if baselines is not None: + df_baselines = df[df["method"].isin(baselines)].copy() + df_baselines = df_baselines.merge(df_selected_first, on=["dataset", "fold"]) + df_baselines = df_baselines[df_baselines["norm_factor"] != 0] + df_baselines["metric_error_rescaled"] = df_baselines["test_error"] / df_baselines["norm_factor"] + # df_baselines["metric_error_val_rescaled"] = df_baselines["metric_error_val"] / df_baselines["norm_factor"] + else: + df_baselines = None + + sliding_window_size = 50 + + n_datasets = len(task_metadata) + + fig, ax = plt.subplots(figsize=(8, 6)) + artists = [] + for i in range(n_datasets + sliding_window_size - 1): + # for i in range(5): + i_min = max(0, i - sliding_window_size + 1) + i_max = min(n_datasets-1, i) + + cur_datasets = task_metadata.iloc[i_min:i_max+1] + cur_datasets_names = cur_datasets.index + print(i_min, i_max, n_datasets, i_max - i_min + 1, len(cur_datasets)) + samples_min = cur_datasets["NumberOfInstances"].min() + samples_max = cur_datasets["NumberOfInstances"].max() + text = f"Dataset #{i_min+1} - 
#{i_max+1} | # Rows: {samples_min} - {samples_max} | Window Size: {i_max - i_min + 1}" + + if df_baselines is not None: + df_baselines_cur = df_baselines[df_baselines["dataset"].isin(cur_datasets_names)] + else: + df_baselines_cur = None + + df_selected_cur = df_selected[df_selected["dataset"].isin(cur_datasets_names)] + if i == 0: + update_ax = True + else: + update_ax = False + + artists_subplot = plot_w(df=df_selected_cur, name=f"my_fig_i{i}.png", ax=ax, update_ax=update_ax, text=text, df_baselines=df_baselines_cur) + artists.append(artists_subplot) + ani = ArtistAnimation(fig=fig, artists=artists, interval=200, blit=True) + # ani.save('animation.html', writer='html') + ani.save("animation.gif") + + from autogluon.common.utils.s3_utils import upload_file + upload_file(file_name="animation.gif", bucket="autogluon-zeroshot") + + +def plot_w(df: pd.DataFrame, name: str = "test_vs_val.png", ax=None, update_ax: bool = False, text: str = None, df_baselines: pd.DataFrame = None): + quantile_levels_og = [0.75] + quantile_levels = [0.5] + quantile_levels_og + [1 - q for q in quantile_levels_og] + minimal_data_quantile = df.groupby("n_portfolio")[["metric_error_rescaled", "metric_error_val_rescaled"]].quantile(q=quantile_levels) + minimal_data = df.groupby("n_portfolio")[["metric_error_rescaled", "metric_error_val_rescaled"]].median().reset_index() + + results = minimal_data + + val_losses = results["metric_error_val_rescaled"] + test_losses = results["metric_error_rescaled"] + models = results["n_portfolio"] + + colors_baselines = ['#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] + + save_fig = ax is None + + ymin = -200 + ymax = 0 + + if ax is None: + fig, ax = plt.subplots(figsize=(8, 6)) + + artists = [] + for q in quantile_levels: + results_q = minimal_data_quantile.loc[:, q, :] + val_losses_q = results_q["metric_error_val_rescaled"] + test_losses_q = results_q["metric_error_rescaled"] + models = results_q.index + + alpha = math.pow((1 - abs(q - 0.5)), 5) + if q != 0.5: + line_val = ax.fill_betweenx([-m for m in models], val_losses_q, val_losses, alpha=alpha, color='#1f77b4') + line_test = ax.fill_betweenx([-m for m in models], test_losses_q, test_losses, alpha=alpha, color='#ff7f0e') + artists += [line_val, line_test] + else: + line_val, = ax.plot(val_losses_q, [-m for m in models], marker='o', linestyle="--", label=f"Val loss ({q})", alpha=alpha, color='#1f77b4') + line_test, = ax.plot(test_losses_q, [-m for m in models], marker='o', linestyle="--", label=f"Test loss ({q})", alpha=alpha, color='#ff7f0e') + artists += [line_val, line_test] + + if df_baselines is not None: + minimal_data_quantile = df_baselines.groupby("method")[["metric_error_rescaled"]].quantile(q=quantile_levels) + minimal_data = df_baselines.groupby("method")[["metric_error_rescaled"]].median() + + test_losses = minimal_data["metric_error_rescaled"] + + baselines = list(df_baselines["method"].unique()) + + for q in quantile_levels: + results_q = minimal_data_quantile.loc[:, q, :] + test_losses_q = results_q["metric_error_rescaled"] + alpha = math.pow((1 - abs(q - 0.5)), 5) + for baseline, colors in zip(baselines, colors_baselines[:len(baselines)]): + test_losses_q_baseline = test_losses_q.loc[baseline] + if q != 0.5: + pass + # line_test = ax.fill_betweenx([ymin, ymax], test_losses_q_baseline, test_losses.loc[baseline], alpha=alpha*0.4, color=colors) + # artists += [line_test] + else: + line_test = ax.vlines(x=test_losses_q_baseline, ymin=ymin, ymax=ymax, linestyles="solid", 
alpha=0.4, label=baseline, colors=colors, linewidth=1.5) + artists += [line_test] + + if text is not None: + text = ax.text(0.5, 1.01, text, fontsize="large") + artists.append(text) + + if update_ax: + ax.legend() + ax.set_xlabel("Relative Loss") + ax.set_ylabel("Portfolio Size") + ax.set_yticks([-m for m in models]) + ax.set_yticklabels(models) + ax.grid(True) + ax.set_xlim(0.5, 1.1) + ax.set_ylim(ymin, ymax) + + if save_fig: + plt.savefig(name) + # plt.show() + return artists diff --git a/scripts/snippet.py b/scripts/snippet.py index 68cd6f72..5bd5d456 100644 --- a/scripts/snippet.py +++ b/scripts/snippet.py @@ -4,7 +4,7 @@ repository = load_repository("D244_F3_C1416_200") # returns in ~2s the tensor of metrics for each dataset/fold obtained after ensembling the given configurations -metrics = repository.evaluate_ensemble( +metrics = repository.evaluate_ensembles( datasets=["abalone", "adult"], # OpenML dataset to report results on folds=[0, 1, 2], # which task to consider for each dataset configs=["CatBoost_r42_BAG_L1", "RandomForest_r12_BAG_L1", "NeuralNetTorch_r40_BAG_L1"], # configs that are ensembled diff --git a/tabrepo/contexts/context.py b/tabrepo/contexts/context.py index df533338..454bf76c 100644 --- a/tabrepo/contexts/context.py +++ b/tabrepo/contexts/context.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import asdict, dataclass from typing import Dict, List, Tuple import os from pathlib import Path @@ -8,8 +8,10 @@ import boto3 from botocore.errorfactory import ClientError import pandas as pd +from typing_extensions import Self -from autogluon.common.loaders import load_pd +from autogluon.common.loaders import load_pd, load_json +from autogluon.common.savers import save_json from autogluon.common.utils.s3_utils import download_s3_files from autogluon.common.utils.s3_utils import is_s3_url, s3_path_to_bucket_prefix @@ -18,6 +20,7 @@ from ..simulation.ground_truth import GroundTruth from ..simulation.simulation_context import ZeroshotSimulatorContext from ..predictions.tabular_predictions import TabularModelPredictions +from ..repository.evaluation_repository import EvaluationRepository from ..utils import catchtime from ..utils.download import download_files @@ -33,6 +36,7 @@ class BenchmarkPaths: zs_pp: List[str] = None zs_gt: List[str] = None configs_hyperparameters: List[str] = None + relative_path: str = None def __post_init__(self): if self.zs_pp is not None and isinstance(self.zs_pp, str): @@ -42,6 +46,48 @@ def __post_init__(self): if self.configs_hyperparameters is not None and isinstance(self.configs_hyperparameters, str): self.configs_hyperparameters = [self.configs_hyperparameters] + @property + def configs_full(self): + return self._to_full(self.configs) + + @property + def baselines_full(self): + return self._to_full(self.baselines) + + @property + def zs_pp_full(self): + return self._to_full_lst(self.zs_pp) + + @property + def zs_gt_full(self): + return self._to_full_lst(self.zs_gt) + + @property + def configs_hyperparameters_full(self): + return self._to_full_lst(self.configs_hyperparameters) + + @property + def task_metadata_full(self): + return self._to_full(self.task_metadata) + + @property + def path_pred_proba_full(self): + return self._to_full(self.path_pred_proba) + + def _to_full(self, path: str) -> str | None: + if self.relative_path is None: + return path + if path is None: + return None + return str(Path(self.relative_path) / path) + + def _to_full_lst(self, paths: list[str] | None) -> 
list[str] | None: + if self.relative_path is None: + return paths + if paths is None: + return None + return [self._to_full(path) for path in paths] + def print_summary(self): max_str_len = max(len(key) for key in self.__dict__.keys()) print(f'BenchmarkPaths Summary:') @@ -49,28 +95,28 @@ def print_summary(self): def get_file_paths(self, include_zs: bool = True) -> List[str]: file_paths = [ - self.configs, - self.baselines, - self.task_metadata, + self.configs_full, + self.baselines_full, + self.task_metadata_full, ] if include_zs: - file_paths += self.zs_pp - file_paths += self.zs_gt + file_paths += self.zs_pp_full + file_paths += self.zs_gt_full file_paths = [f for f in file_paths if f is not None] return file_paths def assert_exists_all(self, check_zs=True): - self._assert_exists(self.configs, 'configs') + self._assert_exists(self.configs_full, 'configs') if self.baselines is not None: - self._assert_exists(self.baselines, 'baselines') + self._assert_exists(self.baselines_full, 'baselines') if self.task_metadata is not None: - self._assert_exists(self.task_metadata, 'task_metadata') + self._assert_exists(self.task_metadata_full, 'task_metadata') if check_zs: if self.zs_pp is not None: - for f in self.zs_pp: + for f in self.zs_pp_full: self._assert_exists(f, f'zs_pp | {f}') if self.zs_gt is not None: - for f in self.zs_gt: + for f in self.zs_gt_full: self._assert_exists(f, f'zs_gt | {f}') @staticmethod @@ -102,6 +148,14 @@ def exists_all(self, check_zs: bool = True) -> bool: required_files = self.get_file_paths(include_zs=check_zs) return all(self.exists(f) for f in required_files) + def missing_files(self, check_zs: bool = True) -> list: + required_files = self.get_file_paths(include_zs=check_zs) + missing_files = [] + for f in required_files: + if not self.exists(f): + missing_files.append(f) + return missing_files + @staticmethod def exists(filepath: str) -> bool: if filepath is None: @@ -122,8 +176,8 @@ def exists(filepath: str) -> bool: def load_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]: df_configs, df_metadata = load_results( - path_configs=self.configs, - path_metadata=self.task_metadata, + path_configs=self.configs_full, + path_metadata=self.task_metadata_full, metadata_join_column=self.metadata_join_column, require_tid_in_metadata=self.task_metadata is not None, ) @@ -132,7 +186,7 @@ def load_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]: def load_baselines(self) -> pd.DataFrame | None: if self.baselines is None: return None - df_baselines = load_pd.load(self.baselines) + df_baselines = load_pd.load(self.baselines_full) return df_baselines def load_predictions(self, @@ -145,13 +199,13 @@ def load_predictions(self, "memopt": Very fast and high memory usage. "mem": Slow and high memory usage, simplest format to debug. 
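For reference, a minimal save/load round trip is sketched below, since directory serialization is the headline feature of this patch. Every call is taken from elsewhere in the diff (`from_context` from the `run_quickstart_from_scratch.py` comment, `to_dir`/`from_dir` from the assertion message added in `tabular_predictions.py`); the directory path is illustrative.

```python
from tabrepo import EvaluationRepository

# Load a pre-built context (same call as the comment in run_quickstart_from_scratch.py).
repo = EvaluationRepository.from_context(version="D244_F3_C1530_30", cache=True)

# Save to a local directory, then reload memory-mapped. Per the assertion message in
# tabular_predictions.py, the memmap prediction format is what enables `config_fallback`.
repo.to_dir("./my_repo_dir")
repo = EvaluationRepository.from_dir("./my_repo_dir", prediction_format="memmap")
repo.print_info()
```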
""" - for f in self.zs_pp: + for f in self.zs_pp_full: self._assert_exists(f, f'zs_pp | {f}') - for f in self.zs_gt: + for f in self.zs_gt_full: self._assert_exists(f, name=f'zs_gt | {f}') zeroshot_pred_proba, zeroshot_gt, zsc = load_zeroshot_input( - path_pred_proba=self.path_pred_proba, - paths_gt=self.zs_gt, + path_pred_proba=self.path_pred_proba_full, + paths_gt=self.zs_gt_full, zsc=zsc, datasets=self.datasets, prediction_format=prediction_format, @@ -159,7 +213,10 @@ def load_predictions(self, return zeroshot_pred_proba, zeroshot_gt, zsc def load_configs_hyperparameters(self) -> dict: - return load_configs(self.configs_hyperparameters) + return load_configs(self.configs_hyperparameters_full) + + def to_dict(self) -> dict: + return asdict(self) class BenchmarkContext: @@ -171,6 +228,7 @@ def __init__(self, description: str = None, date: str = None, s3_download_map: Dict[str, str] = None, + config_fallback: str = None, ): self.folds = folds self.benchmark_paths = benchmark_paths @@ -178,6 +236,7 @@ def __init__(self, self.description = description self.date = date self.s3_download_map = s3_download_map + self.config_fallback = config_fallback @classmethod def from_paths(cls, @@ -187,12 +246,14 @@ def from_paths(cls, description: str = None, date: str = None, s3_download_map: Dict[str, str] = None, + config_fallback: str = None, **paths): return cls(folds=folds, name=name, description=description, date=date, s3_download_map=s3_download_map, + config_fallback=config_fallback, benchmark_paths=BenchmarkPaths(**paths)) def download(self, @@ -330,6 +391,11 @@ def load(self, download_files = False if download_files: self.benchmark_paths.print_summary() + if self.s3_download_map is None: + missing_files = self.benchmark_paths.missing_files() + if missing_files: + missing_files_str = [f'\n\t"{m}"' for m in missing_files] + raise FileNotFoundError(f'Missing {len(missing_files)} required files: \n[{",".join(missing_files_str)}\n]') print(f'Downloading input files from s3...') self.download(include_zs=load_predictions, exists=exists) self.benchmark_paths.assert_exists_all(check_zs=load_predictions) @@ -346,6 +412,29 @@ def load(self, return zsc, zeroshot_pred_proba, zeroshot_gt + def load_repo( + self, + folds: List[int] = None, + load_predictions: bool = True, + download_files: bool = True, + prediction_format: str = "memmap", + exists: str = 'ignore', + ) -> EvaluationRepository: + zsc, zeroshot_pred_proba, zeroshot_gt = self.load( + folds=folds, + load_predictions=load_predictions, + download_files=download_files, + prediction_format=prediction_format, + exists=exists, + ) + repo = EvaluationRepository( + zeroshot_context=zsc, + tabular_predictions=zeroshot_pred_proba, + ground_truth=zeroshot_gt, + config_fallback=self.config_fallback, + ) + return repo + def _load_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]: df_configs, df_metadata = self.benchmark_paths.load_results() return df_configs, df_metadata @@ -377,6 +466,24 @@ def _load_zsc(self, folds: List[int], configs_hyperparameters: dict) -> Zeroshot ) return zsc + def to_json(self, path: str): + output = { + "name": self.name, + "date": self.date, + "description": self.description, + "folds": self.folds, + "s3_download_map": self.s3_download_map, + "config_fallback": self.config_fallback, + "benchmark_paths": self.benchmark_paths.to_dict() + } + save_json.save(path=path, obj=output) + + @classmethod + def from_json(cls, path: str) -> Self: + kwargs = load_json.load(path) + kwargs["benchmark_paths"] = 
BenchmarkPaths(**kwargs["benchmark_paths"]) + return cls(**kwargs) + def construct_s3_download_map( s3_prefix: str, @@ -401,9 +508,9 @@ def construct_s3_download_map( def construct_context( - name: str, - datasets: List[str], - folds: List[int], + name: str | None, + datasets: list[str], + folds: list[int], local_prefix: str, s3_prefix: str = None, description: str = None, @@ -412,7 +519,9 @@ def construct_context( local_prefix_is_relative: bool = True, has_baselines: bool = True, metadata_join_column: str = "dataset", - configs_hyperparameters: List[str] = None, + configs_hyperparameters: list[str] = None, + is_relative: bool = False, + config_fallback: str = None, ) -> BenchmarkContext: """ @@ -463,26 +572,44 @@ def construct_context( else: _s3_download_map = None - zs_pp = [f"{split_key}{f}" for f in _files_pp] - zs_pp = [Paths.rel_to_abs(k, relative_to=data_root) for k in zs_pp] - - zs_gt = [f"{split_key}{f}" for f in _files_gt] - zs_gt = [Paths.rel_to_abs(k, relative_to=data_root) for k in zs_gt] + if is_relative: + zs_pp = [str(Path("model_predictions") / f) for f in _files_pp] + zs_gt = [str(Path("model_predictions") / f) for f in _files_gt] + else: + zs_pp = [f"{split_key}{f}" for f in _files_pp] + zs_pp = [Paths.rel_to_abs(k, relative_to=data_root) for k in zs_pp] + zs_gt = [f"{split_key}{f}" for f in _files_gt] + zs_gt = [Paths.rel_to_abs(k, relative_to=data_root) for k in zs_gt] - _result_paths = dict( - configs=str(Path(path_context) / "configs.parquet"), - ) + if is_relative: + _result_paths = dict(configs="configs.parquet") + else: + _result_paths = dict( + configs=str(Path(path_context) / "configs.parquet"), + ) if has_baselines: - _result_paths["baselines"] = str(Path(path_context) / "baselines.parquet") + if is_relative: + _result_paths["baselines"] = "baselines.parquet" + else: + _result_paths["baselines"] = str(Path(path_context) / "baselines.parquet") if task_metadata is not None: - _task_metadata_path = dict( - task_metadata=str(data_root / "metadata" / task_metadata), - ) + if is_relative: + _task_metadata_path = dict(task_metadata=task_metadata) + else: + _task_metadata_path = dict(task_metadata=str(Path(path_context) / task_metadata)) else: _task_metadata_path = dict() + if is_relative: + split_key = str(Path("model_predictions")) + os.path.sep + + if is_relative: + relative_path = str(Path(path_context)) + else: + relative_path = None + _bag_zs_path = dict( zs_gt=zs_gt, zs_pp=zs_pp, @@ -501,6 +628,8 @@ def construct_context( s3_download_map=_s3_download_map, datasets=datasets, metadata_join_column=metadata_join_column, + relative_path=relative_path, + config_fallback=config_fallback, **_result_paths, **_bag_zs_path, **_task_metadata_path, diff --git a/tabrepo/contexts/context_artificial.py b/tabrepo/contexts/context_artificial.py index 9b752207..090f9961 100644 --- a/tabrepo/contexts/context_artificial.py +++ b/tabrepo/contexts/context_artificial.py @@ -24,6 +24,7 @@ def load_context_artificial( problem_type: str = "regression", seed=0, include_hyperparameters: bool = False, + dtype=np.float32, **kwargs, ): # TODO write specification of dataframes schema, this code produces a minimal example that enables @@ -88,11 +89,11 @@ def load_context_artificial( dataset_name: { fold: { "pred_proba_dict_val": { - m: rng.random((123, n_classes)) if n_classes > 2 else rng.random(123) + m: rng.random((123, n_classes), dtype=dtype) if n_classes > 2 else rng.random(123, dtype=dtype) for m in models }, "pred_proba_dict_test": { - m: rng.random((13, n_classes)) if n_classes > 2 else 
rng.random(13) + m: rng.random((13, n_classes), dtype=dtype) if n_classes > 2 else rng.random(13, dtype=dtype) for m in models } } diff --git a/tabrepo/contexts/subcontext.py b/tabrepo/contexts/subcontext.py index f2588208..475658d2 100644 --- a/tabrepo/contexts/subcontext.py +++ b/tabrepo/contexts/subcontext.py @@ -64,13 +64,7 @@ def _cache(self, **kwargs) -> EvaluationRepository: def load_from_parent(self, **kwargs) -> EvaluationRepository: # TODO: Consider adding configs_full to Repo - zsc, zeroshot_pred_proba, zeroshot_gt = self.parent.load(**kwargs) - - repo = EvaluationRepository( - zeroshot_context=zsc, - tabular_predictions=zeroshot_pred_proba, - ground_truth=zeroshot_gt, - ) + repo = self.parent.load_repo(**kwargs) repo = repo.subset( datasets=self.datasets, folds=self.folds, diff --git a/tabrepo/metrics/_fast_log_loss.py b/tabrepo/metrics/_fast_log_loss.py index 0a6eadae..385bee13 100644 --- a/tabrepo/metrics/_fast_log_loss.py +++ b/tabrepo/metrics/_fast_log_loss.py @@ -19,7 +19,7 @@ def extract_true_class_prob(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarra assert len(y_true) == len(y_pred) if y_pred.ndim == 1: - y_true_bool = y_true.astype(np.bool8) + y_true_bool = y_true.astype(np.bool_) true_class_prob = y_true_bool*y_pred + ~y_true_bool*(1-y_pred) else: assert y_pred.ndim == 2 @@ -49,7 +49,7 @@ def extract_true_class_prob_bulk(y_true: np.ndarray, y_pred_bulk: np.ndarray) -> assert y_pred_bulk.shape[1] == len(y_true), f"y_true and y_pred_bulk have different numbers of samples! " \ f"({len(y_true)}, {y_pred_bulk.shape[1]})" if ndim == 2: - y_true_bool = y_true.astype(np.bool8) + y_true_bool = y_true.astype(np.bool_) true_class_prob_bulk = y_true_bool * y_pred_bulk + ~y_true_bool * (1 - y_pred_bulk) else: # ndim == 3 true_class_prob_bulk = y_pred_bulk[:, range(y_pred_bulk.shape[1]), y_true] diff --git a/tabrepo/metrics/_roc_auc_cpp/__init__.py b/tabrepo/metrics/_roc_auc_cpp/__init__.py index a7bf336a..233dadee 100644 --- a/tabrepo/metrics/_roc_auc_cpp/__init__.py +++ b/tabrepo/metrics/_roc_auc_cpp/__init__.py @@ -29,7 +29,7 @@ def __init__(self): def roc_auc_score(self, y_true: np.array, y_score: np.array) -> float: """a method to calculate AUC via C++ lib. Args: - y_true (np.array): 1D numpy array of dtype=np.bool8 as true labels. + y_true (np.array): 1D numpy array of dtype=np.bool_ as true labels. y_score (np.array): 1D numpy array of dtype=np.float32 as probability predictions. 
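As a quick sanity check of the `np.bool8` to `np.bool_` migration made throughout this patch, a short sketch using the benchmark helper whose body appears in the `bench_utils.py` hunk below (the import path is inferred from the diff header):

```python
import numpy as np

from tabrepo.metrics.bench_utils import generate_y_true_and_y_pred_binary

# The helper produces np.bool_ labels; np.bool8 is a deprecated alias in newer NumPy versions.
y_true, y_pred = generate_y_true_and_y_pred_binary(num_samples=100)
assert y_true.dtype == np.bool_
assert y_pred.dtype == np.float32
```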
Returns: float: AUC score diff --git a/tabrepo/metrics/bench_utils.py b/tabrepo/metrics/bench_utils.py index 8e17d825..c9518d14 100644 --- a/tabrepo/metrics/bench_utils.py +++ b/tabrepo/metrics/bench_utils.py @@ -8,7 +8,7 @@ def generate_y_true_and_y_pred_binary(num_samples, random_seed=0): np.random.seed(seed=random_seed) - y_true = np.random.randint(0, 2, num_samples).astype(np.bool8) + y_true = np.random.randint(0, 2, num_samples).astype(np.bool_) y_pred = np.random.rand(num_samples).astype(np.float32) return y_true, y_pred diff --git a/tabrepo/portfolio/zeroshot_selection.py b/tabrepo/portfolio/zeroshot_selection.py index 3ad54a6b..49f795bf 100644 --- a/tabrepo/portfolio/zeroshot_selection.py +++ b/tabrepo/portfolio/zeroshot_selection.py @@ -14,14 +14,24 @@ def zeroshot_configs(val_scores: np.array, output_size: int) -> List[int]: df_val_scores = pd.DataFrame(val_scores) ranks = pd.DataFrame(df_val_scores).rank(axis=1) res = [] + best_mean = None for _ in range(output_size): # Select greedy-best configuration considering all others - best_idx = ranks.mean(axis=0).idxmin() + if ranks.empty: + # Nothing more to add + break + + cur_ranks_mean = ranks.mean(axis=0) + best_idx = cur_ranks_mean.idxmin() + cur_best_mean = cur_ranks_mean[best_idx] + + if best_mean is not None and cur_best_mean == best_mean: + # No improvement + break + best_mean = cur_best_mean # Update ranks for choosing each configuration considering the previously chosen ones ranks.clip(upper=ranks[best_idx], axis=0, inplace=True) - if best_idx not in df_val_scores: - break # Drop the chosen configuration as a future candidate df_val_scores.drop(columns=best_idx, inplace=True) res.append(best_idx) diff --git a/tabrepo/predictions/tabular_predictions.py b/tabrepo/predictions/tabular_predictions.py index ef17bf10..b6183129 100644 --- a/tabrepo/predictions/tabular_predictions.py +++ b/tabrepo/predictions/tabular_predictions.py @@ -235,11 +235,15 @@ def from_data_dir(cls, data_dir: Union[str, Path], datasets: Optional[List[str]] return cls.from_dict(pred_dict=memmap.to_dict(), datasets=datasets) def predict_val(self, dataset: str, fold: int, models: List[str] = None, model_fallback: str = None) -> np.array: - assert model_fallback is None, "model_fallback not supported for in memory data-structure" + assert model_fallback is None, ("config_fallback not supported for in memory data-structure, " + "try saving repo via `repo.to_dir(path)`, " + "then calling `repo = EvaluationRepository.from_dir(path, prediction_format='memmap')` to enable config_fallback") return self._load_pred(dataset=dataset, fold=fold, models=models, split="val") def predict_test(self, dataset: str, fold: int, models: List[str] = None, model_fallback: str = None) -> np.array: - assert model_fallback is None, "model_fallback not supported for in memory data-structure" + assert model_fallback is None, ("config_fallback not supported for in memory data-structure, " + "try saving repo via `repo.to_dir(path)`, " + "then calling `repo = EvaluationRepository.from_dir(path, prediction_format='memmap')` to enable config_fallback") return self._load_pred(dataset=dataset, fold=fold, models=models, split="test") def _load_pred(self, dataset: str, split: str, fold: int, models: List[str] = None): @@ -359,9 +363,6 @@ def _load_pred(self, dataset: str, split: str, fold: int, models: List[str] = No metadata = self.metadata_dict[dataset][fold] model_indices_all = metadata["model_indices"] model_indices_available = {m: model_indices_all[m] for m in metadata['models']} - if 
model_fallback is None: - # FIXME HACK - model_fallback = "ExtraTrees_c1_BAG_L1" if model_fallback: # we use the model fallback if a model is not present models = [m if m in model_indices_available else model_fallback for m in models] diff --git a/tabrepo/repository/abstract_repository.py b/tabrepo/repository/abstract_repository.py index 4a74ae98..9fbcc574 100644 --- a/tabrepo/repository/abstract_repository.py +++ b/tabrepo/repository/abstract_repository.py @@ -2,16 +2,20 @@ import copy from abc import ABC, abstractmethod +from pathlib import Path from typing import Dict, List import numpy as np import pandas as pd from typing_extensions import Self +from .repo_utils import convert_time_infer_s_from_sample_to_batch from ..simulation.simulation_context import ZeroshotSimulatorContext from ..simulation.single_best_config_scorer import SingleBestConfigScorer from ..utils.cache import SaveLoadMixin +from autogluon.common.savers import save_pd + class AbstractRepository(ABC, SaveLoadMixin): def __init__( @@ -145,17 +149,19 @@ def datasets(self, problem_type: str = None, union: bool = True) -> List[str]: """ return self._zeroshot_context.get_datasets(problem_type=problem_type, union=union) - def configs(self, *, datasets: List[str] = None, tasks: List[str] = None, union: bool = True) -> List[str]: + def configs(self, *, datasets: list[str] = None, tasks: list[tuple[str, int]] = None, union: bool = True) -> list[str]: """ Return all valid configs. By default, will return all configs that appear in any task at least once. Parameters ---------- - datasets : List[str], default = None - If specified, will only consider the configs present in the given datasets - tasks: List[str], default = None - If specified, will only consider the configs present in the given tasks + datasets : list[str], default = None + If specified, will only consider the configs present in the given datasets. + tasks: list[tuple[str, int]], default = None + If specified, will only consider the configs present in the given tasks. + Tasks are in the form `(dataset, fold)`. + For example, `("abalone", 1)`. union: bool, default = True If True, will return the union of configs present in each task. If False, will return the intersection of configs present in each task. @@ -169,6 +175,11 @@ def configs(self, *, datasets: List[str] = None, tasks: List[str] = None, union: def dataset_to_tid(self, dataset: str) -> int: return self._dataset_to_tid_dict[dataset] + def datasets_to_tids(self, datasets: list[str] = None) -> pd.Series: + if datasets is None: + datasets = self.datasets() + return pd.Series({dataset: self._dataset_to_tid_dict[dataset] for dataset in datasets}, name="tid") + def tid_to_dataset(self, tid: int) -> str: return self._tid_to_dataset_dict.get(tid, "Not found") @@ -357,6 +368,23 @@ def dataset_info(self, dataset: str) -> dict: """ return self._zeroshot_context.df_metrics.loc[dataset].to_dict() + def datasets_info(self, datasets: list[str] = None) -> pd.DataFrame: + """ + Parameters + ---------- + datasets: list[str]. 
default = None + If None, uses all datasets + + Returns + ------- + Pandas DataFrame with index "dataset" and two columns: + "metric": The evaluation metric name used for scoring on the dataset + "problem_type": The problem type of the dataset + """ + if datasets is None: + datasets = self.datasets() + return self._zeroshot_context.df_metrics.loc[datasets] + @property def folds(self) -> List[int]: """Folds with any result""" @@ -532,3 +560,126 @@ def _convert_binary_to_multiclass(self, predictions: np.ndarray, dataset: str) - return np.stack([1 - predictions, predictions], axis=predictions.ndim) else: return predictions + + # TODO: repo time_infer_s is per row, results_df is the total time for all rows, need to align later + # TODO: Error if unknown configs/baselines requested + # TODO: Add fillna + # TODO: Docstring + # Q:Whether to keep these functions a part of TabRepo or keep them separate as a part of new fit()-package + def compare_metrics( + self, + results_df: pd.DataFrame = None, + datasets: List[str] = None, + folds: List[int] = None, + configs: List[str] = None, + baselines: List[str] = None, + ) -> pd.DataFrame: + if datasets is None: + datasets = self.datasets() + columns = ["metric_error", "time_train_s", "time_infer_s", "metric", "problem_type"] + + if results_df is not None: + df_exp = results_df.reset_index().set_index(["dataset", "fold", "framework"])[columns] + else: + df_exp = None + + # Dropping task column in df_tr + df_tr = self._zeroshot_context.df_configs.set_index(["dataset", "fold", "framework"])[columns] + + mask = df_tr.index.get_level_values("dataset").isin(datasets) + if folds is not None: + mask = mask & df_tr.index.get_level_values("fold").isin(folds) + if configs is not None: + mask = mask & df_tr.index.get_level_values("framework").isin(configs) + df_tr = df_tr[mask] + + if self.task_metadata is not None: + df_tr = convert_time_infer_s_from_sample_to_batch(df_tr, repo=self) + + if self._zeroshot_context.df_baselines is not None: + df_baselines = self._zeroshot_context.df_baselines.set_index(["dataset", "fold", "framework"])[columns] + + mask = df_baselines.index.get_level_values("dataset").isin(datasets) + if folds is not None: + mask = mask & df_baselines.index.get_level_values("fold").isin(folds) + if baselines is not None: + mask = mask & df_baselines.index.get_level_values("framework").isin(baselines) + df_baselines = df_baselines[mask] + + if self.task_metadata is not None: + df_baselines = convert_time_infer_s_from_sample_to_batch(df_baselines, repo=self) + else: + if baselines: + raise AssertionError(f"Baselines specified but no baseline methods exist! (baselines={baselines})") + df_baselines = None + + df = pd.concat([df_exp, df_tr, df_baselines], axis=0) + df = df.sort_index() + + return df + + # TODO: Rename to something better? + def plot_overall_rank_comparison( + self, + results_df: pd.DataFrame, + save_dir: str, + evaluator_kwargs: dict = None, + calibration_framework: str = None, + ) -> "EvaluatorOutput": + """ + Requires `autogluon_benchmark` to be installed. + + Parameters + ---------- + results_df: pd.DataFrame + The input data to calculate metrics with. + An easy way to obtain a valid `results_df` is to call `repo.compare_metrics(...)` + It should have a multi-index of (dataset, fold, framework), with the following columns: + metric_error: float + metric: str + time_train_s: float + time_infer_s: float + problem_type: str + save_dir: str + The local directory to save comparison results and figures to. 
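A minimal end-to-end sketch of the two comparison helpers added here, `compare_metrics` and `plot_overall_rank_comparison`. It assumes `repo` is a loaded `EvaluationRepository`; the config names come from the README example, the output directory is illustrative, and the plotting step requires `autogluon_benchmark` to be installed.

```python
# Collect metric_error / runtime rows for the stored configs on every dataset.
df = repo.compare_metrics(
    datasets=repo.datasets(),
    folds=[0],
    configs=["CatBoost_r22_BAG_L1", "RandomForest_r12_BAG_L1"],
)

# Rank the frameworks overall and write figures + CSVs under save_dir.
evaluator_output = repo.plot_overall_rank_comparison(
    results_df=df,
    save_dir="./rank_comparison",
)
```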
+ evaluator_kwargs: dict, default = None + The evaluator kwargs. + calibration_framework: str, default = None + The framework to fix at 1000 elo. + + Returns + ------- + EvaluatorOutput object from autogluon_benchmark + """ + try: + from autogluon_benchmark.evaluation.evaluator import Evaluator + from autogluon_benchmark.plotting.plotter import Plotter + except ImportError: + raise ImportError(f"To use `repo.plot_overall_rank_comparison, you must first install autogluon_benchmark.") + if evaluator_kwargs is None: + evaluator_kwargs = {} + results_df = results_df.reset_index().copy() + results_df["tid"] = results_df["dataset"].apply(self.dataset_to_tid) + evaluator = Evaluator(task_metadata=self.task_metadata, **evaluator_kwargs) + evaluator_output = evaluator.transform(results_df) + output_path = Path(save_dir) + figure_savedir = str(output_path / "figures") + save_pd.save(path=str(output_path / "results.csv"), df=results_df) + save_pd.save(path=str(output_path / "results_ranked_agg.csv"), df=evaluator_output.results_ranked_agg) + save_pd.save(path=str(output_path / "results_ranked.csv"), df=evaluator_output.results_ranked) + + plotter = Plotter( + results_ranked_fillna_df=evaluator_output.results_ranked, + results_ranked_df=evaluator_output.results_ranked, + save_dir=figure_savedir, + show=False, + ) + + plotter.plot_all( + calibration_framework=calibration_framework, + calibration_elo=1000, + BOOTSTRAP_ROUNDS=100, # Reduce this to lower values for a faster execution. Use 1000 for the final plot. + plot_critical_difference=False, + ) + + return evaluator_output diff --git a/tabrepo/repository/ensemble_mixin.py b/tabrepo/repository/ensemble_mixin.py index 55242dae..773155e5 100644 --- a/tabrepo/repository/ensemble_mixin.py +++ b/tabrepo/repository/ensemble_mixin.py @@ -1,83 +1,296 @@ from __future__ import annotations -from typing import Tuple, Type +import itertools +from typing import Literal, Tuple, Type +import numpy as np import pandas as pd +from .time_utils import filter_configs_by_runtime, get_runtime from ..simulation.ensemble_selection_config_scorer import EnsembleScorer, EnsembleScorerMaxModels, EnsembleSelectionConfigScorer +from ..utils.parallel_for import parallel_for +# FIXME: Type hints for AbstractRepository, how to do? Protocol? class EnsembleMixin: - # TODO: rank=False by default, include way more information like fit time and infer time? - # TODO: Add time_train_s - # TODO: Add infer_limit + # TODO: rank=False by default? + # TODO: ensemble_size remove, put into ensemble_kwargs? + # TODO: rename to fit_ensemble? + # TODO: Maybe the result output should be a pd.Series or dataclass? Finalize prior to TabRepo 2.0 release. + # Ditto for ensemble_weights def evaluate_ensemble( self, - datasets: list[str], + dataset: str, + fold: int, configs: list[str] = None, *, + time_limit: float = None, ensemble_cls: Type[EnsembleScorer] = EnsembleScorerMaxModels, ensemble_kwargs: dict = None, ensemble_size: int = 100, rank: bool = True, - folds: list[int] | None = None, - backend: str = "ray", - ) -> Tuple[pd.Series, pd.DataFrame]: + fit_order: Literal["original", "random"] = "original", + seed: int = 0, + ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ - :param datasets: list of datasets to compute errors on. - :param configs: list of config to consider for ensembling. Uses all configs if None. - :param ensemble_size: number of members to select with Caruana. - :param ensemble_cls: class used for the ensemble model. 
- :param ensemble_kwargs: kwargs to pass to the init of the ensemble class. - :param rank: whether to return ranks or raw scores (e.g. RMSE). Ranks are computed over all base models and - automl framework. - :param folds: list of folds that need to be evaluated, use all folds if not provided. - :param backend: Options include ["native", "ray"]. - :return: Tuple: - Pandas Series of ensemble test errors per task, with multi-index (dataset, fold). - Pandas DataFrame of ensemble weights per task, with multi-index (dataset, fold). Columns are the names of each config. + Evaluates an ensemble of a list of configs on a given task (dataset, fold). + + Parameters + ---------- + dataset: str + The dataset to evaluate + fold: int + The fold of the dataset to evaluate + configs: list[str], default = None + The list of configs to consider for ensembling. + If None, will use all configs. + Models will be simulated as being fit in the order specified in `fit_order`. + time_limit: float, default = None + The time limit of the ensemble. + Will only consider the first N models in `configs` whose cumulative time limit is less than `time_limit`. + ensemble_cls: Type[EnsembleScorer], default = EnsembleScorerMaxModels + The ensemble method to use. + ensemble_kwargs: dict, default = None + The ensemble method kwargs. + ensemble_size: int, default = 100 + The number of ensemble iterations. + rank: bool, default = True + If True, additionally calculates the rank of the ensemble result. + fit_order: Literal["original", "random"], default = "original" + Whether to simulate the models being fit in their original order sequentially or randomly. + seed: int, default = 0 + The random seed used to shuffle `configs` if `fit_order="random"`. + + Returns + ------- + result: pd.DataFrame + A single-row multi-index (dataset, fold) DataFrame with the following columns: + metric_error: float + The ensemble's metric test error. + metric: str + The target evaluation metric. + time_train_s: float + The training time of the ensemble in seconds (the sum of all considered models' time_train_s) + time_infer_s: float + The inference time of the ensemble in seconds (the sum of all non-zero weight models' time_infer_s) + problem_type: str + The problem type of the task. + metric_error_val: float + The ensemble's metric validation error. + ensemble_weights: pd.DataFrame + A single-row multi-index (dataset, fold) DataFrame with column names equal to `configs`. + Each config column's value is the weight given to it by the ensemble model. + This can be used for debugging purposes and for deeper analysis. 
+ """ - if folds is None: - folds = self.folds + task = self.task_name(dataset=dataset, fold=fold) if configs is None: - configs = self.configs() - tasks = [ - self.task_name(dataset=dataset, fold=fold) - for dataset in datasets - for fold in folds - ] + configs = self.configs(tasks=[(dataset, fold)]) + + if time_limit is not None: + if fit_order == "random": + # randomly shuffle the configs + rng = np.random.default_rng(seed=seed) + configs_fit_order = list(rng.permuted(configs)) + else: + configs_fit_order = configs + + # filter configs to the first N configs whose combined time_limit is less than the provided time_limit + configs = filter_configs_by_runtime(repo=self, dataset=dataset, fold=fold, config_names=configs_fit_order, max_cumruntime=time_limit) + + if len(configs) == 0: + # if not enough time to fit any model, use the fallback config if it exists, even if it would be over the time limit + # if no config fallback was specified, then raise an AssertionError + if self._config_fallback is None: + if len(configs_fit_order) > 0: + raise AssertionError( + f"Can't fit an ensemble with no configs when self._config_fallback is None " + f"(No configs are trainable in the provided time_limit={time_limit}.)" + ) + else: + raise AssertionError(f"Can't fit an ensemble with no configs when self._config_fallback is None.") + configs = [self._config_fallback] + scorer = self._construct_ensemble_selection_config_scorer( - tasks=tasks, + tasks=[task], ensemble_size=ensemble_size, ensemble_cls=ensemble_cls, ensemble_kwargs=ensemble_kwargs, - backend=backend, + backend="native", + ) + + # fit the ensemble and retrieve the metric error and ensemble weights + results = scorer.compute_errors(configs=configs) + metric_error = results[task]["metric_error"] + ensemble_weights = results[task]["ensemble_weights"] + metric_error_val = results[task]["metric_error_val"] + + dataset_info = self.dataset_info(dataset=dataset) + metric = dataset_info["metric"] + problem_type = dataset_info["problem_type"] + + # select configurations used in the ensemble as infer time only depends on the models with non-zero weight. 
+ fail_if_missing = self._config_fallback is None + + # compute the ensemble time_train_s by summing all considered config's time_train_s + runtimes = get_runtime( + repo=self, + dataset=dataset, + fold=fold, + config_names=configs, + runtime_col='time_train_s', + fail_if_missing=fail_if_missing, + ) + time_train_s = sum(runtimes.values()) + + # compute the ensemble time_infer_s by summing all considered config's time_infer_s that have non-zero weight + config_selected_ensemble = [ + config for i, config in enumerate(configs) if ensemble_weights[i] != 0 + ] + latencies = get_runtime( + repo=self, + dataset=dataset, + fold=fold, + config_names=config_selected_ensemble, + runtime_col='time_infer_s', + fail_if_missing=fail_if_missing, ) + time_infer_s = sum(latencies.values()) - dict_errors, dict_ensemble_weights = scorer.compute_errors(configs=configs) + output_dict = { + "metric_error": [metric_error], + "metric": [metric], + "time_train_s": [time_train_s], + "time_infer_s": [time_infer_s], + "problem_type": [problem_type], + "metric_error_val": [metric_error_val], + } if rank: - dict_scores = scorer.compute_ranks(errors=dict_errors) - out = dict_scores - else: - out = dict_errors + dict_ranks = scorer.compute_ranks(errors={task: metric_error}) + rank_list = dict_ranks[task] + output_dict["rank"] = [rank_list] - dataset_folds = [(self.task_to_dataset(task=task), self.task_to_fold(task=task)) for task in tasks] - ensemble_weights = [dict_ensemble_weights[task] for task in tasks] - out_list = [out[task] for task in tasks] + multiindex = pd.MultiIndex.from_tuples([(dataset, fold)], names=["dataset", "fold"]) + df_ensemble_weights = pd.DataFrame(data=[ensemble_weights], index=multiindex, columns=configs) + df_out = pd.DataFrame(data=output_dict, index=multiindex) - multiindex = pd.MultiIndex.from_tuples(dataset_folds, names=["dataset", "fold"]) + return df_out, df_ensemble_weights - df_name = "rank" if rank else "error" - df_out = pd.Series(data=out_list, index=multiindex, name=df_name) - df_ensemble_weights = pd.DataFrame(data=ensemble_weights, index=multiindex, columns=configs) + # TODO: Docstring + def evaluate_ensembles( + self, + datasets: list[str] = None, + folds: list[int] = None, + configs: list[str] = None, + *, + ensemble_cls: Type[EnsembleScorer] = EnsembleScorerMaxModels, + ensemble_kwargs: dict = None, + ensemble_size: int = 100, + time_limit: float = None, + fit_order: Literal["original", "random"] = "original", + seed: int = 0, + rank: bool = True, + backend: Literal["ray", "native"] = "ray", + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Evaluates an ensemble of a list of configs on a given set of tasks (datasets x folds). + + Identical to calling `evaluate_ensemble` once for each task and then concatenating the results, + however this method will be much faster due to parallelization. + + Parameters + ---------- + datasets: list[str], default = None + The datasets to evaluate. + If None, will use all datasets. + folds: list[int], default = None + The folds of the dataset to evaluate. + If None, will use all folds. + configs: list[str], default = None + The list of configs to consider for ensembling. + If None, will use all configs. + Models will be simulated as being fit in the order specified in `fit_order`. + time_limit: float, default = None + The time limit of the ensemble. + Will only consider the first N models in `configs` whose cumulative time limit is less than `time_limit`. 
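# Usage sketch for the single-task `evaluate_ensemble` API above, assuming `repo`
# is an already-loaded EvaluationRepository; the fold and time budget are
# illustrative. `repo.configs(tasks=...)` mirrors the internal call made when
# `configs=None`.
dataset = repo.datasets()[0]
configs = repo.configs(tasks=[(dataset, 0)])

df_result, df_weights = repo.evaluate_ensemble(
    dataset=dataset,
    fold=0,
    configs=configs,
    ensemble_size=100,
    time_limit=3600,      # only the leading configs whose summed train time fits in 1h are kept
    fit_order="random",   # simulate a random fit order under the time limit
    seed=0,
)
# Single-row (dataset, fold) frame: metric_error, metric, time_train_s,
# time_infer_s, problem_type, metric_error_val (+ rank when rank=True).
print(df_result)
# One weight per candidate config; only non-zero-weight configs count toward time_infer_s.
print(df_weights.loc[(dataset, 0)].sort_values(ascending=False).head())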
+ ensemble_cls: Type[EnsembleScorer], default = EnsembleScorerMaxModels + The ensemble method to use. + ensemble_kwargs: dict, default = None + The ensemble method kwargs. + ensemble_size: int, default = 100 + The number of ensemble iterations. + rank: bool, default = True + If True, additionally calculates the rank of the ensemble result. + fit_order: Literal["original", "random"], default = "original" + Whether to simulate the models being fit in their original order sequentially or randomly. + seed: int, default = 0 + The random seed used to shuffle `configs` if `fit_order="random"`. + backend: Literal["ray", "native"], default = "ray" + The backend to use when running the list of tasks. + + Returns + ------- + result: pd.DataFrame + A multi-index (dataset, fold) DataFrame where each row corresponds to a task, with the following columns: + metric_error: float + The ensemble's metric test error. + metric: str + The target evaluation metric. + time_train_s: float + The training time of the ensemble in seconds (the sum of all considered models' time_train_s) + time_infer_s: float + The inference time of the ensemble in seconds (the sum of all non-zero weight models' time_infer_s) + problem_type: str + The problem type of the task. + metric_error_val: float + The ensemble's metric validation error. + ensemble_weights: pd.DataFrame + A multi-index (dataset, fold) DataFrame with column names equal to `configs`. Each row corresponds to a task. + Each config column's value is the weight given to it by the ensemble model. + This can be used for debugging purposes and for deeper analysis. + + """ + if backend == "native": + backend = "sequential" + if folds is None: + folds = self.folds + if datasets is None: + datasets = self.datasets() + + context = dict( + self=self, + configs=configs, + ensemble_cls=ensemble_cls, + ensemble_kwargs=ensemble_kwargs, + ensemble_size=ensemble_size, + time_limit=time_limit, + fit_order=fit_order, + seed=seed, + rank=rank, + ) + + inputs = list(itertools.product(datasets, folds)) + inputs = [{"dataset": dataset, "fold": fold} for dataset, fold in inputs] + + list_rows = parallel_for( + self.__class__.evaluate_ensemble, + inputs=inputs, + context=context, + engine=backend, + ) + + df_out = pd.concat([l[0] for l in list_rows], axis=0) + df_ensemble_weights = pd.concat([l[1] for l in list_rows], axis=0) # FIXME: Is this guaranteed same columns in each? 
return df_out, df_ensemble_weights - def _construct_ensemble_selection_config_scorer(self, - ensemble_size: int = 10, - backend='ray', - **kwargs) -> EnsembleSelectionConfigScorer: + def _construct_ensemble_selection_config_scorer( + self, + ensemble_size: int = 10, + backend: str = 'ray', + **kwargs + ) -> EnsembleSelectionConfigScorer: config_scorer = EnsembleSelectionConfigScorer.from_repo( repo=self, ensemble_size=ensemble_size, # 100 is better, but 10 allows to simulate 10x faster diff --git a/tabrepo/repository/evaluation_repository.py b/tabrepo/repository/evaluation_repository.py index 4bb719bc..2e8d6815 100644 --- a/tabrepo/repository/evaluation_repository.py +++ b/tabrepo/repository/evaluation_repository.py @@ -1,9 +1,12 @@ from __future__ import annotations import copy -from typing import List +import os +from pathlib import Path +from typing import Any, List, Literal import numpy as np +import pandas as pd from typing_extensions import Self from .abstract_repository import AbstractRepository @@ -183,8 +186,145 @@ def _construct_config_scorer(self, raise ValueError(f'Invalid config_scorer_type: {config_scorer_type}') @classmethod - def from_context(cls, version: str = None, prediction_format: str = "memmap"): - return load_repository(version=version, prediction_format=prediction_format) + def from_context(cls, version: str = None, cache: bool = False, prediction_format: str = "memmap") -> Self: + return load_repository(version=version, cache=cache, prediction_format=prediction_format) + + # TODO: 1. Cleanup results_lst_simulation_artifacts, 2. Make context work with tasks instead of datasets x folds + # TODO: Get raw data from repo method (X, y) + # TODO: Score task repo method? + # TODO: Remove score_vs_only_baselines and pct args from zeroshot_context? 
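# Usage sketch for the batched `evaluate_ensembles` API defined above; it runs one
# ensemble simulation per (dataset, fold) task, in parallel with ray by default.
# Assumes `repo` is an already-loaded EvaluationRepository; values are illustrative.
df_results, df_weights = repo.evaluate_ensembles(
    datasets=repo.datasets()[:5],
    folds=[0, 1, 2],
    configs=None,       # None -> all configs
    ensemble_size=100,
    backend="ray",      # "native" runs the tasks sequentially
)
# One row per task; aggregate however is convenient, e.g.:
print(df_results.groupby("problem_type")["metric_error"].mean())
print(df_results[["time_train_s", "time_infer_s"]].describe())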
+ # TODO: unit test + # TODO: docstring + # FIXME: Support memmap directly, without needing full `results_lst_simulation_artifacts` in-memory + @classmethod + def from_raw( + cls, + df_configs: pd.DataFrame, + results_lst_simulation_artifacts: list[dict[str, dict[int, dict]]], + df_baselines: pd.DataFrame = None, + task_metadata: pd.DataFrame = None, + configs_hyperparameters: dict[str, dict[str, Any]] = None, + pct: bool = False, + score_against_only_baselines: bool = False, + ) -> Self: + from tabrepo.predictions import TabularPredictionsInMemory + from tabrepo.simulation.ground_truth import GroundTruth + from tabrepo.simulation.simulation_context import ZeroshotSimulatorContext + from autogluon.common.utils.simulation_utils import convert_simulation_artifacts_to_tabular_predictions_dict + + required_columns = [ + "dataset", + "fold", + "framework", + "metric_error", + "metric", + "problem_type", + "time_train_s", + "time_infer_s", + ] + + for column in required_columns: + if column not in df_configs: + raise AssertionError(f"Missing required column in df_configs: {column}\ndf_configs columns: {list(df_configs.columns)}") + + simulation_artifacts_full = cls._convert_sim_artifacts(results_lst_simulation_artifacts=results_lst_simulation_artifacts) + + zeroshot_pp, zeroshot_gt = convert_simulation_artifacts_to_tabular_predictions_dict(simulation_artifacts=simulation_artifacts_full) + + predictions = TabularPredictionsInMemory.from_dict(zeroshot_pp) + ground_truth = GroundTruth.from_dict(zeroshot_gt) + + zeroshot_context = ZeroshotSimulatorContext( + df_configs=df_configs, + df_baselines=df_baselines, + df_metadata=task_metadata, + configs_hyperparameters=configs_hyperparameters, + pct=pct, + score_against_only_baselines=score_against_only_baselines, + ) + + repo = cls( + zeroshot_context=zeroshot_context, + tabular_predictions=predictions, + ground_truth=ground_truth, + ) + + return repo + + def to_dir(self, path: str): + from tabrepo.contexts.context import BenchmarkContext, construct_context + + path = os.path.abspath(path) + os.path.sep + path_data_dir = path + "model_predictions/" + + # FIXME: use tasks rather than datasets and folds separately + datasets = self.datasets() + folds = self.folds + if folds is not None: + # make list serializable to json + folds = [int(f) for f in folds] + + self._tabular_predictions.to_data_dir(data_dir=path_data_dir) + self._ground_truth.to_data_dir(data_dir=path_data_dir) + metadata = self._zeroshot_context.to_dir(path=path) + + configs_hyperparameters = metadata["configs_hyperparameters"] + if configs_hyperparameters is not None: + configs_hyperparameters = [configs_hyperparameters] + + # FIXME: Make this a repo constructor method? 
+ # FIXME: s3_download_map doesn't work with is_relative yet + context: BenchmarkContext = construct_context( + name=None, + datasets=datasets, + folds=folds, + local_prefix=path, + local_prefix_is_relative=False, # TODO: Set to False by default and rename + has_baselines=metadata["df_baselines"] is not None, + task_metadata=metadata["df_metadata"], + configs_hyperparameters=configs_hyperparameters, + is_relative=True, + config_fallback=self._config_fallback, + ) + + context.to_json(path=str(Path(path) / "context.json")) + + @classmethod + def from_dir( + cls, + path: str, + prediction_format: Literal["memmap", "memopt", "mem"] = "memmap", + update_relative_path: bool = True, + ) -> Self: + from tabrepo.contexts.context import BenchmarkContext + + path_context = str(Path(path) / "context.json") + context = BenchmarkContext.from_json(path=path_context) + if update_relative_path: + context.benchmark_paths.relative_path = str(Path(path)) + + repo = context.load_repo(prediction_format=prediction_format) + return repo + + @classmethod + def _convert_sim_artifacts(cls, results_lst_simulation_artifacts: list[dict[str, dict[int, dict[str, Any]]]]) -> dict[str, dict[int, dict[str, Any]]]: + # FIXME: Don't require all results in memory at once + simulation_artifacts_full = {} + for simulation_artifacts in results_lst_simulation_artifacts: + for k in simulation_artifacts.keys(): + if k not in simulation_artifacts_full: + simulation_artifacts_full[k] = {} + for f in simulation_artifacts[k]: + if f not in simulation_artifacts_full[k]: + simulation_artifacts_full[k][f] = copy.deepcopy(simulation_artifacts[k][f]) + else: + for method in simulation_artifacts[k][f]["pred_proba_dict_val"]: + if method in simulation_artifacts_full[k][f]["pred_proba_dict_val"]: + raise AssertionError(f"Two results exist for dataset {k}, fold {f}, method {method}!") + else: + simulation_artifacts_full[k][f]["pred_proba_dict_val"][method] = simulation_artifacts[k][f]["pred_proba_dict_val"][method] + simulation_artifacts_full[k][f]["pred_proba_dict_test"][method] = simulation_artifacts[k][f]["pred_proba_dict_test"][method] + return simulation_artifacts_full def load_repository(version: str, *, load_predictions: bool = True, cache: bool | str = False, prediction_format: str = "memmap") -> EvaluationRepository: diff --git a/tabrepo/repository/repo_utils.py b/tabrepo/repository/repo_utils.py new file mode 100644 index 00000000..b246f41a --- /dev/null +++ b/tabrepo/repository/repo_utils.py @@ -0,0 +1,40 @@ +from typing import TYPE_CHECKING + +import pandas as pd + +if TYPE_CHECKING: + from .abstract_repository import AbstractRepository + + +def convert_time_infer_s_from_sample_to_batch(df: pd.DataFrame, repo: "AbstractRepository") -> pd.DataFrame: + """ + Temp: Multiply by 0.1 since 90% of the instances are used for training and 10% for test + # TODO: Change this in future, not all tasks will use 90% train / 10% test. Instead keep track of train/test rows per dataset_fold pair. 
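# Usage sketch for the new save/load-to-directory support above. Assumes `repo`
# is an in-memory EvaluationRepository (e.g. built with `from_raw` or loaded from
# a context); the directory name is a placeholder.
from tabrepo import EvaluationRepository

repo.to_dir(path="my_repo_artifact")  # writes model predictions, ground truth, and context.json
repo_loaded = EvaluationRepository.from_dir(
    path="my_repo_artifact",
    prediction_format="memmap",  # or "mem"/"memopt" to hold predictions fully in memory
)
assert repo_loaded.datasets() == repo.datasets()
assert repo_loaded.configs() == repo.configs()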
+ """ + df = df.copy(deep=True) + if "dataset" in df.columns: + df["time_infer_s"] = df["time_infer_s"] * df["dataset"].map( + repo.task_metadata.set_index("dataset")["NumberOfInstances"] + ) * 0.1 + else: + df["time_infer_s"] = df["time_infer_s"] * df.index.get_level_values("dataset").map( + repo.task_metadata.set_index("dataset")["NumberOfInstances"] + ) * 0.1 + return df + + +def convert_time_infer_s_from_batch_to_sample(df: pd.DataFrame, repo: "AbstractRepository") -> pd.DataFrame: + """ + Temp: Multiply by 0.1 since 90% of the instances are used for training and 10% for test + # TODO: Change this in future, not all tasks will use 90% train / 10% test. Instead keep track of train/test rows per dataset_fold pair. + """ + df = df.copy(deep=True) + if "dataset" in df.columns: + df["time_infer_s"] = df["time_infer_s"] / (df["dataset"].map( + repo.task_metadata.set_index("dataset")["NumberOfInstances"] + ) * 0.1) + else: + df["time_infer_s"] = df["time_infer_s"] / (df.index.get_level_values("dataset").map( + repo.task_metadata.set_index("dataset")["NumberOfInstances"] + ) * 0.1) + return df diff --git a/tabrepo/repository/time_utils.py b/tabrepo/repository/time_utils.py index ee190617..a1a84a09 100644 --- a/tabrepo/repository/time_utils.py +++ b/tabrepo/repository/time_utils.py @@ -1,34 +1,36 @@ from typing import List, Optional, Dict import numpy as np import pandas as pd -from tabrepo.repository.evaluation_repository import EvaluationRepository +from .abstract_repository import AbstractRepository + +# FIXME: Should move into repo? def get_runtime( - repo: EvaluationRepository, - tid: int, + repo: AbstractRepository, + dataset: str, fold: int, config_names: Optional[List[str]] = None, - task_col: str = "task", runtime_col: str = 'time_train_s', fail_if_missing: bool = True ) -> Dict[str, float]: """ :param repo: - :param tid: + :param dataset: :param fold: :param config_names: :param fail_if_missing: whether to raise an error if some configurations are missing :return: a dictionary with keys are elements in `config_names` and the values are runtimes of the configuration on the task `tid`_`fold`. 
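# A tiny numeric sketch of the per-sample <-> per-batch conversion implemented by
# the two helpers above, under their current 90%/10% train/test assumption; the
# numbers are made up for illustration.
time_infer_per_sample = 0.002                      # seconds per test row
number_of_instances = 10_000                       # "NumberOfInstances" from task_metadata
n_test_rows = number_of_instances * 0.1            # 10% of the rows are treated as test
time_infer_batch = time_infer_per_sample * n_test_rows   # sample -> batch: 2.0 seconds total
assert abs(time_infer_batch / n_test_rows - time_infer_per_sample) < 1e-12  # batch -> sample round-trips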
""" - dataset = repo.tid_to_dataset(tid=tid) task = repo.task_name(dataset=dataset, fold=fold) if not config_names: config_names = repo.configs() - df_metrics = repo._zeroshot_context.df_configs_ranked - df_configs = pd.DataFrame(config_names, columns=["framework"]).merge(df_metrics[df_metrics[task_col] == task]) - runtime_configs = dict(df_configs.set_index('framework')[runtime_col]) + config_metrics = repo.metrics(datasets=[dataset], folds=[fold], configs=config_names) + runtime_series = config_metrics[runtime_col] + runtime_series.index = runtime_series.index.get_level_values("framework") + runtime_configs = runtime_series.to_dict() + missing_configurations = set(config_names).difference(runtime_configs.keys()) if len(missing_configurations) > 0: if fail_if_missing: @@ -40,9 +42,8 @@ def get_runtime( else: # todo take mean of framework if repo._config_fallback is not None: - df_configs_fallback = pd.DataFrame([repo._config_fallback], columns=["framework"]).merge(df_metrics[df_metrics[task_col] == task]) - runtime_configs_fallback = dict(df_configs_fallback.set_index('framework')[runtime_col]) - fill_value = runtime_configs_fallback[repo._config_fallback] + config_metrics_fallback = repo.metrics(datasets=[dataset], folds=[fold], configs=[repo._config_fallback]) + fill_value = config_metrics_fallback.loc[(dataset, fold, repo._config_fallback), runtime_col] else: fill_value = np.mean(list(runtime_configs.values())) # print(f"Imputing missing value {fill_value} for configurations {missing_configurations} on task {task}") @@ -52,7 +53,7 @@ def get_runtime( def sort_by_runtime( - repo: EvaluationRepository, + repo: AbstractRepository, config_names: List[str], ascending: bool = True, ) -> List[str]: @@ -64,8 +65,8 @@ def sort_by_runtime( def filter_configs_by_runtime( - repo: EvaluationRepository, - tid: int, + repo: AbstractRepository, + dataset: str, fold: int, config_names: List[str], max_cumruntime: Optional[float] = None @@ -82,14 +83,14 @@ def filter_configs_by_runtime( if not max_cumruntime: return config_names else: - assert tid in repo.tids() + assert dataset in repo.datasets() assert fold in repo.folds - runtime_configs = get_runtime(repo=repo, tid=tid, fold=fold, config_names=config_names, fail_if_missing=False) - cumruntime = np.cumsum(list(runtime_configs.values())) + runtime_configs = get_runtime(repo=repo, dataset=dataset, fold=fold, config_names=config_names, fail_if_missing=False) + cumruntime = np.cumsum([runtime_configs[config] for config in config_names]) # str_runtimes = ", ".join([f"{name}: {time}" for name, time in zip(runtime_configs.keys(), cumruntime)]) # print(f"Cumulative runtime:\n {str_runtimes}") - # gets index where cumulative runtime is bellow the target and next index is above the target + # gets index where cumulative runtime is below the target and next index is above the target i = np.searchsorted(cumruntime, max_cumruntime) return config_names[:i] diff --git a/tabrepo/simulation/ensemble_selection_config_scorer.py b/tabrepo/simulation/ensemble_selection_config_scorer.py index ace70cd6..5168df1c 100644 --- a/tabrepo/simulation/ensemble_selection_config_scorer.py +++ b/tabrepo/simulation/ensemble_selection_config_scorer.py @@ -4,12 +4,12 @@ from typing import Dict, List, Optional, Tuple, Type, Union, TYPE_CHECKING import numpy as np -import ray from autogluon.core.metrics import get_metric, Scorer from autogluon.core.models.greedy_ensemble.ensemble_selection import EnsembleSelection from .configuration_list_scorer import ConfigurationListScorer +from 
..utils.parallel_for import parallel_for from ..utils.rank_utils import RankScorer from ..utils import task_to_tid_fold from ..metrics import _fast_log_loss, _fast_roc_auc @@ -18,21 +18,32 @@ from ..repository.evaluation_repository import EvaluationRepository -@ray.remote -def compute_error_ray(config_scorer, configs: List[str], task: str) -> (float, dict): - error, ensemble_weights = config_scorer.evaluate_task(task=task, models=configs) - return error, ensemble_weights - - class EnsembleScorer: def __init__(self, repo: "EvaluationRepository", task_metrics_metadata, - ensemble_method: callable = EnsembleSelection, + ensemble_method: Type = EnsembleSelection, ensemble_method_kwargs: dict = None, proxy_fit_metric_map: dict = None, use_fast_metrics: bool = True, + optimize_on: str = "val", + return_metric_error_val: bool = True, ): + """ + Parameters + ---------- + repo: EvaluationRepository + task_metrics_metadata + ensemble_method: Type = EnsembleSelection + ensemble_method_kwargs: dict, default = None + proxy_fit_metric_map: dict, default = None + use_fast_metrics: bool, default = True + optimize_on: str, default = "val" + If "val", optimizes on the validation data (normal process that mirrors what can be done in practice) + If "test", optimizes on the test data (cheat mode, use this only for debugging and testing generalization gaps) + return_metric_error_val: bool, default = True + If True, will compute and return `metric_error_val` using the fitted ensemble in the output dict of `evaluate_task`. + """ if proxy_fit_metric_map is None: proxy_fit_metric_map = dict() if ensemble_method_kwargs is None: @@ -46,6 +57,10 @@ def __init__(self, self.task_metrics_metadata = task_metrics_metadata self.proxy_fit_metric_map = proxy_fit_metric_map self.use_fast_metrics = use_fast_metrics + assert isinstance(optimize_on, str) + assert optimize_on in ["val", "test"] + self.optimize_on = optimize_on + self.return_metric_error_val = return_metric_error_val def _get_metric_from_name(self, metric_name: str, problem_type: str) -> Scorer: if self.use_fast_metrics: @@ -81,20 +96,33 @@ def filter_models(self, dataset: str, fold: int, models: List[str]) -> List[str] """ return models - def evaluate_task(self, dataset: str, fold: int, models: List[str]) -> Tuple[float, np.array]: + def evaluate_task(self, dataset: str, fold: int, models: List[str]) -> dict[str, object]: n_models = len(models) task_metadata = self.task_metrics_metadata[dataset] metric_name = task_metadata["metric"] problem_type = task_metadata["problem_type"] - y_val = self.repo.labels_val(dataset=dataset, fold=fold) + y_val_og = self.repo.labels_val(dataset=dataset, fold=fold) y_test = self.repo.labels_test(dataset=dataset, fold=fold) # If filtering models, need to keep track of original model order to return ensemble weights list models_filtered = self.filter_models(dataset=dataset, fold=fold, models=models) models, models_filtered_idx = self._get_models_filtered_idx(models=models, models_filtered=models_filtered) - pred_val, pred_test = self.get_preds_from_models(dataset=dataset, fold=fold, models=models) + pred_val_og, pred_test = self.get_preds_from_models(dataset=dataset, fold=fold, models=models) + + if self.optimize_on == "val": + # Use the original validation data for a fair comparison that mirrors what happens in practice + y_val = y_val_og + pred_val = pred_val_og + elif self.optimize_on == "test": + # Optimize directly on test (unrealistic, but can be used to measure the gap in generalization) + # TODO: Another variant that could be 
implemented, do 50% of test as val and the rest as test + # to simulate impact of using holdout validation + y_val = copy.deepcopy(y_test) + pred_val = copy.deepcopy(pred_test) + else: + raise ValueError(f"Invalid value for `optimize_on`: {self.optimize_on}") if problem_type == 'binary': # Force binary prediction probabilities to 1 dimensional prediction probabilites of the positive class @@ -140,6 +168,16 @@ def evaluate_task(self, dataset: str, fold: int, models: List[str]) -> Tuple[flo y_test_pred = weighted_ensemble.predict_proba(pred_test) err = eval_metric.error(y_test, y_test_pred) + metric_error_val = None + if self.return_metric_error_val: + if hasattr(eval_metric, 'preprocess_bulk'): + y_val_og, pred_val_og = eval_metric.preprocess_bulk(y_val_og, pred_val_og) + if eval_metric.needs_pred: + y_val_pred = weighted_ensemble.predict(pred_val_og) + else: + y_val_pred = weighted_ensemble.predict_proba(pred_val_og) + metric_error_val = eval_metric.error(y_val_og, y_val_pred) + ensemble_weights: np.array = weighted_ensemble.weights_ # ensemble_weights has to be updated, need to be in the original models order @@ -147,7 +185,14 @@ def evaluate_task(self, dataset: str, fold: int, models: List[str]) -> Tuple[flo ensemble_weights_fixed[models_filtered_idx] = ensemble_weights ensemble_weights = ensemble_weights_fixed - return err, ensemble_weights + results = dict( + metric_error=err, + ensemble_weights=ensemble_weights, + ) + if self.return_metric_error_val: + results["metric_error_val"] = metric_error_val + + return results def _get_models_filtered_idx(self, models: list[str], models_filtered: list[str]) -> Tuple[list[str], list[int]]: """ @@ -339,48 +384,46 @@ def from_repo(cls, repo: "EvaluationRepository", **kwargs): **kwargs, ) - def evaluate_task(self, task: str, models: List[str]) -> Tuple[float, np.array]: + def evaluate_task(self, task: str, models: List[str]) -> dict[str, object]: tid, fold = task_to_tid_fold(task=task) dataset = self.tid_to_dataset_name_dict[tid] return self.ensemble_scorer.evaluate_task(dataset=dataset, fold=fold, models=models) - def compute_errors(self, configs: List[str]) -> Tuple[Dict[str, float], Dict[str, np.array]]: + def compute_errors(self, configs: list[str]) -> dict[str, dict[str, object]]: """ Compute and return test errors and ensemble weights for all tasks on the user-specified list of configs. :param configs: List of model config names to ensemble and compute test errors with. - :return: Tuple: - Dictionary of task_name -> test evaluation metric error of the ensemble. - Dictionary of task_name -> model weights in the ensemble. Model weights are stored in a numpy array, - with weights corresponding to the order of `configs`. + :return: dict: + task -> dict: + metric_error: test evaluation metric error of the ensemble. + metric_error_val: val evaluation metric error of the ensemble. + ensemble_weights: model weights in the ensemble. Model weights are stored in a numpy array, with weights corresponding to the order of `configs`. 
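# Usage sketch for the dict-of-dicts result format described above (this mirrors
# how `EnsembleMixin.evaluate_ensemble` consumes it). Assumes `scorer` was built
# via `EnsembleSelectionConfigScorer.from_repo(repo, tasks=..., ensemble_size=...)`
# and that `configs` lists config names present in the repo.
results = scorer.compute_errors(configs=configs)
for task, task_result in results.items():
    test_err = task_result["metric_error"]
    weights = task_result["ensemble_weights"]              # np.array aligned with `configs`
    val_err = task_result.get("metric_error_val")          # present when return_metric_error_val=True (default)
    n_used = int((weights > 0).sum())
    print(f"{task}: test_error={test_err:.4f} val_error={val_err} members={n_used}")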
""" - if self.backend == 'ray': - return self.compute_errors_ray(configs=configs) - errors = dict() - ensemble_weights = dict() - for task in self.tasks: - errors[task], ensemble_weights[task] = self.evaluate_task(task=task, models=configs) - return errors, ensemble_weights - - # speedup can be obtained by only sending minimum zeroshot pred proba info for each task by using lazy format - def compute_errors_ray(self, configs: List[str]) -> Tuple[Dict[str, float], Dict[str, np.array]]: - # Create and execute all tasks in parallel - if not ray.is_initialized(): - ray.init() - config_scorer = ray.put(self) - results = [] - for i in range(len(self.tasks)): - results.append(compute_error_ray.remote( - config_scorer, - configs, - self.tasks[i], - )) - results_list = ray.get(results) - errors_list = [r[0] for r in results_list] - ensemble_weights_list = [r[1] for r in results_list] - errors = {self.tasks[i]: errors_list[i] for i in range(len(self.tasks))} - ensemble_weights = {self.tasks[i]: ensemble_weights_list[i] for i in range(len(self.tasks))} - return errors, ensemble_weights + engine = self.backend + if engine == "native": + engine = "sequential" + + context = dict( + self=self, + models=configs, + ) + + if engine == "sequential": + progress_bar = False + else: + progress_bar = True + + inputs = [{"task": task} for task in self.tasks] + results_rows = parallel_for( + self.__class__.evaluate_task, + inputs=inputs, + context=context, + engine=engine, + progress_bar=progress_bar, + ) + results = {task: result for task, result in zip(self.tasks, results_rows)} + return results def compute_ranks(self, errors: Dict[str, float]) -> Dict[str, float]: ranks = {} diff --git a/tabrepo/simulation/ground_truth.py b/tabrepo/simulation/ground_truth.py index 4e6b8b00..5d971236 100644 --- a/tabrepo/simulation/ground_truth.py +++ b/tabrepo/simulation/ground_truth.py @@ -1,12 +1,13 @@ +from __future__ import annotations + import tempfile -from typing import Dict, List import pandas as pd from pathlib import Path class GroundTruth: - def __init__(self, label_val_dict: Dict[str, Dict[int, pd.Series]], label_test_dict: Dict[str, Dict[int, pd.Series]]): + def __init__(self, label_val_dict: dict[str, dict[int, pd.Series]], label_test_dict: dict[str, dict[int, pd.Series]]): """ :param label_val_dict: dictionary from tid to fold to labels series where the index are openml rows and @@ -18,9 +19,12 @@ def __init__(self, label_val_dict: Dict[str, Dict[int, pd.Series]], label_test_d self._label_test_dict = label_test_dict @property - def datasets(self) -> List[str]: + def datasets(self) -> list[str]: return sorted(list(self._label_val_dict.keys())) + def dataset_folds(self, dataset: str) -> list[int]: + return sorted(list(self._label_val_dict[dataset].keys())) + # FIXME: Add restrict instead, same as tabular_predictions def remove_dataset(self, dataset: str): self._label_val_dict.pop(dataset) diff --git a/tabrepo/simulation/simulation_context.py b/tabrepo/simulation/simulation_context.py index 6b92fa2f..608f7413 100644 --- a/tabrepo/simulation/simulation_context.py +++ b/tabrepo/simulation/simulation_context.py @@ -4,14 +4,18 @@ from collections import defaultdict import json from pathlib import Path -from typing import Optional, List, Union, Tuple +from typing import Any, Optional, List, Union, Tuple +from typing_extensions import Self import pandas as pd +from autogluon.common.loaders import load_json, load_pd +from autogluon.common.savers import save_json, save_pd from .ground_truth import GroundTruth from .sim_utils 
import get_dataset_to_tid_dict, get_task_to_dataset_dict, filter_datasets, get_dataset_to_metric_problem_type from ..predictions import TabularModelPredictions, TabularPredictionsMemmap, TabularPredictionsInMemory, TabularPredictionsInMemoryOpt +from ..utils import task_to_tid_fold from ..utils.rank_utils import RankScorer @@ -26,7 +30,7 @@ def __init__( df_configs: pd.DataFrame, df_baselines: pd.DataFrame = None, df_metadata: pd.DataFrame = None, - configs_hyperparameters: dict = None, + configs_hyperparameters: dict[str, dict[str, Any]] = None, folds: List[int] | None = None, pct: bool = False, score_against_only_baselines: bool = True, @@ -65,6 +69,8 @@ def __init__( score_against_only_automl=self.score_against_only_baselines, pct=self.pct, ) + if self.folds is None: + self.folds = sorted(list(self.df_configs["fold"].unique())) self.dataset_to_tasks_dict = self._compute_dataset_to_tasks() self.dataset_to_problem_type_dict = self.df_configs_ranked[['dataset', 'problem_type']].drop_duplicates().set_index( @@ -139,10 +145,15 @@ def _align_valid_folds(cls, assert len(dataset_problem_types) == len(dataset_problem_types["dataset"].unique()), \ "Error: datasets exist in `df_configs` that contain multiple problem_types!" + if df_metadata is not None: + assert "dataset" in df_metadata, (f"Missing required `dataset` column in metadata.\n" + f"Columns: {list(df_metadata.columns)}") + assert len(df_metadata) == len(df_metadata["dataset"].unique()) + if df_baselines is not None: dataset_problem_types_comparison = df_baselines[["dataset", "problem_type"]].drop_duplicates() assert len(dataset_problem_types_comparison) == len(dataset_problem_types_comparison["dataset"].unique()), \ - "Error: datasets exist in `df_comparison` that contain multiple problem_types!" + "Error: datasets exist in `df_baselines` that contain multiple problem_types!" dataset_problem_types_map_configs = dataset_problem_types.set_index("dataset").squeeze(axis=1).to_dict() dataset_problem_types_map_baselines = dataset_problem_types_comparison.set_index("dataset").squeeze(axis=1).to_dict() for d in dataset_problem_types_map_configs.keys(): @@ -155,16 +166,29 @@ def _align_valid_folds(cls, f"\tdf_baselines: {problem_type_baselines}") if "tid" not in df_configs.columns: - print(f"Note: `tid` is missing from `df_configs` columns. `tid` will be created by mapping the sorted unique `dataset` values to [0, n-1]. " - f"These values will be unrelated to OpenML task IDs.") - df_configs = df_configs.copy(deep=True) - datasets = sorted(list(df_configs["dataset"].unique())) - dataset_to_tid_map = {d: i for i, d in enumerate(datasets)} - df_configs["tid"] = df_configs["dataset"].map(dataset_to_tid_map).astype(int) + if df_metadata is not None and "tid" in df_metadata.columns: + dataset_tid_map = df_metadata.set_index("dataset")["tid"] + df_configs["tid"] = df_configs["dataset"].map(dataset_tid_map) + else: + print(f"Note: `tid` is missing from `df_configs` columns. `tid` will be created by mapping the sorted unique `dataset` values to [0, n-1]. 
" + f"These values will be unrelated to OpenML task IDs.") + df_configs = df_configs.copy(deep=True) + datasets = sorted(list(df_configs["dataset"].unique())) + dataset_to_tid_map = {d: i for i, d in enumerate(datasets)} + df_configs["tid"] = df_configs["dataset"].map(dataset_to_tid_map).astype(int) # assert that each dataset-tid combination is exclusive dataset_tid = df_configs[["dataset", "tid"]].drop_duplicates() - assert len(dataset_tid) == len(dataset_tid["dataset"].unique()) + if len(dataset_tid) != len(dataset_tid["dataset"].unique()): + dataset_counts = dataset_tid["dataset"].value_counts() + non_unique_datasets = dataset_counts[dataset_counts > 1] + dataset_tid_invalid = dataset_tid[dataset_tid["dataset"].isin(non_unique_datasets.index)].sort_values(by=["dataset", "tid"]).reset_index(drop=True) + print(dataset_tid_invalid) + raise ValueError( + f"{len(non_unique_datasets)} invalid datasets encountered! Datasets contain different task IDs (tid) within `df_configs`. " + f"Ensure the tid is unique.\nInvalid Datasets:\n{dataset_tid_invalid}" + ) + assert len(dataset_tid) == len(dataset_tid["tid"].unique()) if df_baselines is not None: @@ -238,8 +262,6 @@ def _align_valid_folds(cls, df_metrics = get_dataset_to_metric_problem_type(df=df_configs) if df_metadata is not None: - assert "dataset" in df_metadata, (f"Missing required `dataset` column in metadata.\n" - f"Columns: {list(df_metadata.columns)}") df_metadata = copy.deepcopy(df_metadata) df_metadata = df_metadata[df_metadata["dataset"].isin(unique_datasets)] assert sorted(list(df_metadata["dataset"].unique())) == sorted(list(unique_datasets)) @@ -360,9 +382,12 @@ def get_tids(self, problem_type=None) -> List[int]: tids = [self.dataset_to_tid_dict[dataset] for dataset in datasets] return tids - def get_tasks(self, - datasets: Optional[List[str]] = None, - problem_type: Optional[Union[str, List[str]]] = None) -> List[str]: + def get_tasks( + self, + datasets: list[str] = None, + problem_type: str | list[str] = None, + as_dataset_fold: bool = False, + ) -> list[str] | list[tuple[str, int]]: """ :param datasets: a list of dataset parent names, only return folds that have a parent in this list :param problem_type: a problem type from AutoGluon in "multiclass", "binary", ... or list of problem types @@ -377,8 +402,15 @@ def get_tasks(self, if not isinstance(problem_type, list): problem_type = [problem_type] tasks = [task for task in tasks if self.dataset_to_problem_type_dict[self.task_to_dataset_dict[task]] in problem_type] + if as_dataset_fold: + tasks = [self._task_to_dataset_fold(task) for task in tasks] return tasks + def _task_to_dataset_fold(self, task: str) -> tuple[str, int]: + tid, fold = task_to_tid_fold(task=task) + dataset = self.tid_to_dataset_dict[tid] + return dataset, fold + def _get_tasks_from_datasets(self, datasets: List[str]): dataset_folds = [] for d in datasets: @@ -393,7 +425,7 @@ def tid_to_dataset_dict(self): def task_name_from_tid(tid: int, fold: int) -> str: return f"{tid}_{fold}" - def get_configs(self, *, datasets: List[str] = None, tasks: List[str] = None, union: bool = True) -> List[str]: + def get_configs(self, *, datasets: List[str] = None, tasks: list[tuple[str, int]] = None, union: bool = True) -> List[str]: """ Return all valid configs. By default, will return all configs that appear in any task at least once. 
@@ -420,11 +452,11 @@ def get_configs(self, *, datasets: List[str] = None, tasks: List[str] = None, un raise ValueError(f"Invalid datasets specified: {sorted(list(datasets_invalid))}") df = df[df["dataset"].isin(datasets)] if tasks is not None: - tasks_all = set(self.get_tasks()) + tasks_all = set(self.get_tasks(as_dataset_fold=True)) tasks_invalid = set(tasks).difference(tasks_all) if len(tasks_invalid) != 0: raise ValueError(f"Invalid tasks specified: {sorted(list(tasks_invalid))}") - df = df[df["task"].isin(tasks)] + df = df[df.set_index(["dataset", "fold"]).index.isin(tasks)] if len(df) == 0: raise AssertionError(f"No valid results for tasks={tasks} | datasets={datasets}") @@ -632,3 +664,69 @@ def _verify_configs_hyperparameters(cls, configs_hyperparameters: dict[str, dict assert "hyperparameters" in v, f"configs_hyperparameters value for key {config} must include a `hyperparameters` key" assert isinstance(v["hyperparameters"], dict), (f"configs_hyperparameters['{config}']['hyperparameters'] " f"must be of type dict, found: {type(v['hyperparameters'])}") + + def to_dir(self, path: str) -> dict: + path_configs = "configs.parquet" + save_pd.save(path=str(Path(path) / path_configs), df=self.df_configs) + + path_baselines = None + if self.df_baselines is not None: + path_baselines = "baselines.parquet" + save_pd.save(path=str(Path(path) / path_baselines), df=self.df_baselines) + + path_metadata = None + if self.df_metadata is not None: + path_metadata = "task_metadata.parquet" + save_pd.save(path=str(Path(path) / path_metadata), df=self.df_metadata) + + path_configs_hyperparameters = None + if self.configs_hyperparameters is not None: + path_configs_hyperparameters = "configs_hyperparameters.json" + save_json.save(path=str(Path(path) / path_configs_hyperparameters), obj=self.configs_hyperparameters) + + metadata = { + "df_configs": path_configs, + "df_baselines": path_baselines, + "df_metadata": path_metadata, + "configs_hyperparameters": path_configs_hyperparameters, + "pct": self.pct, + "score_against_only_baselines": self.score_against_only_baselines, + } + path_metadata_json = "metadata.json" + save_json.save(path=str(Path(path) / path_metadata_json), obj=metadata) + return metadata + + @classmethod + def from_dir(cls, path: str) -> Self: + path_metadata_json = "metadata.json" + metadata = load_json.load(path=path_metadata_json) + + path_configs = metadata["df_configs"] + df_configs = load_pd.load(str(Path(path) / path_configs)) + + df_baselines = None + path_baselines = metadata["df_baselines"] + if path_baselines is not None: + df_baselines = load_pd.load(str(Path(path) / path_baselines)) + + df_metadata = None + path_metadata = metadata["df_metadata"] + if path_metadata is not None: + df_metadata = load_pd.load(str(Path(path) / path_metadata)) + + configs_hyperparameters = None + path_configs_hyperparameters = metadata["configs_hyperparameters"] + if path_configs_hyperparameters is not None: + configs_hyperparameters = load_json.load(str(Path(path) / path_configs_hyperparameters)) + + pct = metadata["pct"] + score_against_only_baselines = metadata["score_against_only_baselines"] + + return cls( + df_configs=df_configs, + df_baselines=df_baselines, + df_metadata=df_metadata, + configs_hyperparameters=configs_hyperparameters, + pct=pct, + score_against_only_baselines=score_against_only_baselines, + ) diff --git a/tabrepo/utils/cache.py b/tabrepo/utils/cache.py index 1523e18e..3d54d271 100644 --- a/tabrepo/utils/cache.py +++ b/tabrepo/utils/cache.py @@ -19,7 +19,7 @@ def 
cache_function( fun: Callable[[], object], cache_name: str, ignore_cache: bool = False, - cache_path: Optional[Path] = None + cache_path: Optional[Path | str] = None ): f""" :param fun: a function whose result obtained `fun()` will be cached, the output of the function must be serializable. @@ -30,7 +30,7 @@ def cache_function( """ if cache_path is None: cache_path = default_cache_path - cache_file = cache_path / (cache_name + ".pkl") + cache_file = Path(cache_path) / (cache_name + ".pkl") cache_file.parent.mkdir(parents=True, exist_ok=True) if cache_file.exists() and not ignore_cache: print(f"Loading cache {cache_file}") @@ -48,21 +48,19 @@ def cache_function( def cache_function_dataframe( - fun: Callable[[], pd.DataFrame], - cache_name: str, - ignore_cache: bool = False, - cache_path: Optional[Path] = None -): + fun: Callable[[], pd.DataFrame], + cache_name: str, + cache_path: Path | str, + ignore_cache: bool = False, +) -> pd.DataFrame: f""" :param fun: a function whose dataframe result obtained `fun()` will be cached - :param cache_name: the cache of the function result is written into `{cache_path}/{cache_name}.csv.zip` + :param cache_name: the cache of the function result is written into `{cache_path}/{cache_name}.csv` + :param cache_path: folder where to write cache files :param ignore_cache: whether to recompute even if the cache is present - :param cache_path: folder where to write cache files, default to ~/cache-zeroshot/ :return: result of fun() """ - if cache_path is None: - cache_path = default_cache_path - cache_file = cache_path / (cache_name + ".csv.zip") + cache_file = Path(cache_path) / (cache_name + ".csv") cache_file.parent.mkdir(parents=True, exist_ok=True) if cache_file.exists() and not ignore_cache: print(f"Loading cache {cache_file}") diff --git a/tabrepo/utils/parallel_for.py b/tabrepo/utils/parallel_for.py index 27e75bca..78aa0cfa 100644 --- a/tabrepo/utils/parallel_for.py +++ b/tabrepo/utils/parallel_for.py @@ -7,8 +7,9 @@ def parallel_for( f: Callable[[object], B], inputs: List[Union[list, dict]], - context: dict, + context: dict = None, engine: str = "ray", + progress_bar: bool = True, ) -> List[B]: """ Evaluates an embarrasingly parallel for-loop. @@ -22,10 +23,12 @@ def parallel_for( `[f(x, **context) for x in inputs]`. """ assert engine in ["sequential", "ray", "joblib"] + if context is None: + context = {} if engine == "sequential": return [ f(**x, **context) if isinstance(x, dict) else f(*x, **context) - for x in tqdm(inputs) + for x in tqdm(inputs, disable=not progress_bar) ] if engine == "joblib": from joblib import Parallel, delayed @@ -42,4 +45,4 @@ def remote_f(x, context): return f(**x, **context) if isinstance(x, dict) else f(*x, **context) remote_context = ray.put(context) remote_results = [remote_f.remote(x, remote_context) for x in inputs] - return [ray.get(res) for res in tqdm(remote_results)] + return [ray.get(res) for res in tqdm(remote_results, disable=not progress_bar)] diff --git a/tst/test_cache.py b/tst/test_cache.py index 302a8a63..a0e2070e 100644 --- a/tst/test_cache.py +++ b/tst/test_cache.py @@ -17,5 +17,6 @@ def f(): return pd.DataFrame({"a": [1, 2], "b": [3, 4]}) for ignore_cache in [True, False]: - res = cache_function_dataframe(f, "f", ignore_cache=ignore_cache) + # TODO: Consider using a true tempdir to avoid side-effects, question: how to pass a tempdir as a function argument? 
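# Usage sketch addressing the TODO above: a temporary directory can simply be
# passed as the now-required `cache_path` argument of `cache_function_dataframe`.
# A minimal, self-contained variant of the test (directory chosen by tempfile):
import tempfile

import pandas as pd

from tabrepo.utils.cache import cache_function_dataframe


def make_df():
    return pd.DataFrame({"a": [1, 2], "b": [3, 4]})


with tempfile.TemporaryDirectory() as tmp_dir:
    for ignore_cache in [True, False]:
        out = cache_function_dataframe(make_df, "f", cache_path=tmp_dir, ignore_cache=ignore_cache)
        pd.testing.assert_frame_equal(out, make_df())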
+ res = cache_function_dataframe(f, "f", cache_path="tmp_cache_dir", ignore_cache=ignore_cache) pd.testing.assert_frame_equal(res, pd.DataFrame({"a": [1, 2], "b": [3, 4]})) diff --git a/tst/test_metrics.py b/tst/test_metrics.py index 7847d167..8f792865 100644 --- a/tst/test_metrics.py +++ b/tst/test_metrics.py @@ -59,7 +59,7 @@ def assert_fast_roc_auc_equivalence(y_true, y_pred, rtol=1e-7): def test_fast_roc_auc_ties(y_true, y_pred): """Ensure fast_roc_auc produces equivalent scores to AutoGluon and Scikit-Learn roc_auc implementations when ties exist""" - y_true = np.array(y_true, dtype=np.bool8) + y_true = np.array(y_true, dtype=np.bool_) y_pred = np.array(y_pred, dtype=np.float32) assert_fast_roc_auc_equivalence(y_true=y_true, y_pred=y_pred) diff --git a/tst/test_repository.py b/tst/test_repository.py index c4001d88..f365aec4 100644 --- a/tst/test_repository.py +++ b/tst/test_repository.py @@ -1,5 +1,5 @@ import copy - +import shutil from typing import Callable import numpy as np @@ -12,7 +12,14 @@ def verify_equivalent_repository( repo1: EvaluationRepository | EvaluationRepositoryCollection, repo2: EvaluationRepository | EvaluationRepositoryCollection, + exact: bool = True, + verify_metrics: bool = True, + verify_predictions: bool = True, verify_ensemble: bool = False, + verify_baselines: bool = True, + verify_metadata: bool = True, + verify_configs_hyperparameters: bool = True, + verify_config_fallback: bool = True, backend: str = "native", ): assert repo1.folds == repo2.folds @@ -20,22 +27,63 @@ def verify_equivalent_repository( assert repo1.configs() == repo2.configs() assert repo1.datasets() == repo2.datasets() assert sorted(repo1.dataset_fold_config_pairs()) == sorted(repo2.dataset_fold_config_pairs()) - for dataset in repo1.datasets(): - for f in repo1.folds: - for c in repo1.configs(): - repo1_test = repo1.predict_test(dataset=dataset, config=c, fold=f) - repo2_test = repo2.predict_test(dataset=dataset, config=c, fold=f) - repo1_val = repo1.predict_val(dataset=dataset, config=c, fold=f) - repo2_val = repo2.predict_val(dataset=dataset, config=c, fold=f) - assert np.array_equal(repo1_test, repo2_test) - assert np.array_equal(repo1_val, repo2_val) - assert np.array_equal(repo1.labels_test(dataset=dataset, fold=f), repo2.labels_test(dataset=dataset, fold=f)) - assert np.array_equal(repo1.labels_val(dataset=dataset, fold=f), repo2.labels_val(dataset=dataset, fold=f)) + if verify_metrics: + metrics1 = repo1.metrics().sort_index() + metrics2 = repo2.metrics().sort_index() + assert metrics1.equals(metrics2) + if verify_config_fallback: + assert repo1._config_fallback == repo2._config_fallback + if verify_predictions: + for dataset in repo1.datasets(): + for f in repo1.folds: + for c in repo1.configs(): + repo1_test = repo1.predict_test(dataset=dataset, config=c, fold=f) + repo2_test = repo2.predict_test(dataset=dataset, config=c, fold=f) + repo1_val = repo1.predict_val(dataset=dataset, config=c, fold=f) + repo2_val = repo2.predict_val(dataset=dataset, config=c, fold=f) + if exact: + assert np.array_equal(repo1_test, repo2_test) + assert np.array_equal(repo1_val, repo2_val) + else: + assert np.isclose(repo1_test, repo2_test).all() + assert np.isclose(repo1_val, repo2_val).all() + if exact: + assert np.array_equal(repo1.labels_test(dataset=dataset, fold=f), repo2.labels_test(dataset=dataset, fold=f)) + assert np.array_equal(repo1.labels_val(dataset=dataset, fold=f), repo2.labels_val(dataset=dataset, fold=f)) + else: + assert np.isclose(repo1.labels_test(dataset=dataset, fold=f), 
repo2.labels_test(dataset=dataset, fold=f)).all()
+            assert np.isclose(repo1.labels_val(dataset=dataset, fold=f), repo2.labels_val(dataset=dataset, fold=f)).all()
     if verify_ensemble:
-        df_out_1, df_ensemble_weights_1 = repo1.evaluate_ensemble(datasets=repo1.datasets(), ensemble_size=10, backend=backend)
-        df_out_2, df_ensemble_weights_2 = repo2.evaluate_ensemble(datasets=repo2.datasets(), ensemble_size=10, backend=backend)
+        df_out_1, df_ensemble_weights_1 = repo1.evaluate_ensembles(datasets=repo1.datasets(), ensemble_size=10, backend=backend)
+        df_out_2, df_ensemble_weights_2 = repo2.evaluate_ensembles(datasets=repo2.datasets(), ensemble_size=10, backend=backend)
         assert df_out_1.equals(df_out_2)
         assert df_ensemble_weights_1.equals(df_ensemble_weights_2)
+    if verify_baselines:
+        baselines1 = repo1._zeroshot_context.df_baselines
+        baselines2 = repo2._zeroshot_context.df_baselines
+        if baselines1 is not None:
+            columns1 = sorted(list(baselines1.columns))
+            columns2 = sorted(list(baselines2.columns))
+            assert columns1 == columns2
+            baselines1 = baselines1[columns1].sort_values(by=columns1, ignore_index=True)
+            baselines2 = baselines2[columns1].sort_values(by=columns1, ignore_index=True)
+            assert baselines1.equals(baselines2)
+        else:
+            assert baselines1 == baselines2
+    if verify_metadata:
+        metadata1 = repo1.task_metadata
+        metadata2 = repo2.task_metadata
+        if metadata1 is None:
+            assert metadata1 == metadata2
+        else:
+            columns1 = sorted(list(metadata1.columns))
+            columns2 = sorted(list(metadata2.columns))
+            assert columns1 == columns2
+            metadata1 = metadata1[columns1].sort_values(by=columns1, ignore_index=True)
+            metadata2 = metadata2[columns1].sort_values(by=columns1, ignore_index=True)
+            assert metadata1.equals(metadata2)
+    if verify_configs_hyperparameters:
+        assert repo1.configs_hyperparameters() == repo2.configs_hyperparameters()


 def test_repository():
@@ -60,8 +108,8 @@ def test_repository():
     assert repo.labels_val(dataset=dataset, fold=2).shape == (123,)
     assert repo.labels_test(dataset=dataset, fold=2).shape == (13,)
     assert repo.dataset_metadata(dataset=dataset) == {'dataset': dataset, 'task_type': 'TaskType.SUPERVISED_CLASSIFICATION'}
-    result_errors, result_ensemble_weights = repo.evaluate_ensemble(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")
-    assert result_errors.shape == (3,)
+    result_errors, result_ensemble_weights = repo.evaluate_ensembles(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")
+    assert result_errors.shape == (3, 7)
     assert len(result_ensemble_weights) == 3

     dataset_info = repo.dataset_info(dataset=dataset)
@@ -73,15 +121,15 @@ def test_repository():
     assert np.allclose(result_ensemble_weights.loc[(dataset, 0)], [1.0, 0.0])

     # Test `max_models_per_type`
-    result_errors_w_max_models, result_ensemble_weights_w_max_models = repo.evaluate_ensemble(
+    result_errors_w_max_models, result_ensemble_weights_w_max_models = repo.evaluate_ensembles(
         datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native", ensemble_kwargs={"max_models_per_type": 1}
     )
-    assert result_errors_w_max_models.shape == (3,)
+    assert result_errors_w_max_models.shape == (3, 7)
     assert len(result_ensemble_weights_w_max_models) == 3
     assert np.allclose(result_ensemble_weights_w_max_models.loc[(dataset, 0)], [1.0, 0.0])
-    assert repo.evaluate_ensemble(datasets=[dataset], configs=[config, config],
-                                  ensemble_size=5, folds=[2], backend="native")[0].shape == (1,)
+    assert repo.evaluate_ensembles(datasets=[dataset], configs=[config, config],
+                                   ensemble_size=5, folds=[2], backend="native")[0].shape == (1, 7)

     repo: EvaluationRepository = repo.subset(folds=[0, 2])
     assert repo.datasets() == ['abalone', 'ada']
@@ -93,9 +141,9 @@ def test_repository():
     assert repo.predict_test(dataset=dataset, config=config, fold=2).shape == (13, 25)
     assert repo.dataset_metadata(dataset=dataset) == {'dataset': dataset, 'task_type': 'TaskType.SUPERVISED_CLASSIFICATION'}
     # result_errors, result_ensemble_weights = repo.evaluate_ensemble(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")[0],
-    assert repo.evaluate_ensemble(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")[0].shape == (2,)
-    assert repo.evaluate_ensemble(datasets=[dataset], configs=[config, config],
-                                  ensemble_size=5, folds=[2], backend="native")[0].shape == (1,)
+    assert repo.evaluate_ensembles(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")[0].shape == (2, 7)
+    assert repo.evaluate_ensembles(datasets=[dataset], configs=[config, config], ensemble_size=5, folds=[2], backend="native")[0].shape == (1, 7)
+    assert repo.evaluate_ensemble(dataset=dataset, fold=2, configs=[config, config], ensemble_size=5)[0].shape == (1, 7)

     repo: EvaluationRepository = repo.subset(folds=[2], datasets=[dataset], configs=[config])
     assert repo.datasets() == ['abalone']
@@ -106,10 +154,9 @@ def test_repository():
     assert repo.predict_val(dataset=dataset, config=config, fold=2).shape == (123, 25)
     assert repo.predict_test(dataset=dataset, config=config, fold=2).shape == (13, 25)
     assert repo.dataset_metadata(dataset=dataset) == {'dataset': dataset, 'task_type': 'TaskType.SUPERVISED_CLASSIFICATION'}
-    assert repo.evaluate_ensemble(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")[0].shape == (1,)
+    assert repo.evaluate_ensembles(datasets=[dataset], configs=[config, config], ensemble_size=5, backend="native")[0].shape == (1, 7)

-    assert repo.evaluate_ensemble(datasets=[dataset], configs=[config, config],
-                                  ensemble_size=5, folds=[2], backend="native")[0].shape == (1,)
+    assert repo.evaluate_ensembles(datasets=[dataset], configs=[config, config], ensemble_size=5, folds=[2], backend="native")[0].shape == (1, 7)


 def test_repository_force_to_dense():
@@ -186,7 +233,10 @@ def test_repository_subset():
 def test_repository_configs_hyperparameters():
     repo1 = load_repo_artificial()
     repo2 = load_repo_artificial(include_hyperparameters=True)
-    verify_equivalent_repository(repo1, repo2, verify_ensemble=True)
+    verify_equivalent_repository(repo1, repo2, verify_ensemble=True, verify_configs_hyperparameters=False)
+
+    with pytest.raises(AssertionError):
+        verify_equivalent_repository(repo1, repo2, verify_configs_hyperparameters=True)

     configs = ['NeuralNetFastAI_r1', 'NeuralNetFastAI_r2']

@@ -242,6 +292,89 @@ def test_repository_configs_hyperparameters():
         ]}

+
+def test_repository_save_load():
+    """test repo save and load work"""
+    repo = load_repo_artificial(include_hyperparameters=True)
+    save_path = "tmp_repo"
+    repo.to_dir(path=save_path)
+    repo_loaded = EvaluationRepository.from_dir(path=save_path)
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded, verify_ensemble=True, exact=True)
+
+    repo_float64 = load_repo_artificial(include_hyperparameters=True, dtype=np.float64)
+    save_path = "tmp_repo_from_float64"
+    repo_float64.to_dir(path=save_path)
+    repo_loaded_float64 = EvaluationRepository.from_dir(path=save_path)
+    # exact=False because the loaded version is float32 and the original is float64
+    verify_equivalent_repository(repo1=repo_float64, repo2=repo_loaded_float64, verify_ensemble=True, exact=False)
+
+
+def test_repository_save_load_with_moving_files():
+    """test repo save and load work when moving files to different directories"""
+
+    save_path = "tmp_repo"
+    copy_path = "tmp_repo_copy"
+    shutil.rmtree(save_path, ignore_errors=True)
+    shutil.rmtree(copy_path, ignore_errors=True)
+
+    repo = load_repo_artificial(include_hyperparameters=True)
+    repo.set_config_fallback(config_fallback=repo.configs()[0])
+
+    assert repo._config_fallback == repo.configs()[0]
+    with pytest.raises(AssertionError):
+        repo.predict_test(dataset="abalone", fold=0, config=repo.configs()[0])
+
+    repo.to_dir(path=save_path)
+    repo_loaded = EvaluationRepository.from_dir(path=save_path)
+    repo_loaded_mem = EvaluationRepository.from_dir(path=save_path, prediction_format="mem")
+    repo_loaded_memopt = EvaluationRepository.from_dir(path=save_path, prediction_format="memopt")
+
+    assert repo._config_fallback == repo_loaded_mem._config_fallback
+    assert repo._config_fallback == repo_loaded_memopt._config_fallback
+
+    repo_loaded.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+    with pytest.raises(AssertionError):
+        repo_loaded_mem.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+    with pytest.raises(AssertionError):
+        repo_loaded_memopt.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+    repo.set_config_fallback(None)
+    repo_loaded_mem.set_config_fallback(None)
+    repo_loaded_memopt.set_config_fallback(None)
+
+    assert repo_loaded_mem._config_fallback is None
+    assert repo_loaded_memopt._config_fallback is None
+
+    repo_loaded_mem.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+    repo_loaded_memopt.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded, verify_ensemble=True, exact=True, verify_config_fallback=False)
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded_mem, verify_ensemble=True, exact=True)
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded_memopt, verify_ensemble=True, exact=True)
+
+    shutil.copytree(save_path, copy_path)
+
+    repo_loaded_copy = EvaluationRepository.from_dir(path=copy_path)
+    verify_equivalent_repository(repo1=repo_loaded, repo2=repo_loaded_copy, verify_ensemble=True, exact=True)
+
+    # verify that the original stops working after deleting the original files
+    repo_loaded.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+    shutil.rmtree(save_path)
+    with pytest.raises(FileNotFoundError):
+        repo_loaded.predict_test(dataset="abalone", fold=0, config=repo_loaded.configs()[0])
+
+    # verify in-memory repos don't require the original files
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded_mem, verify_ensemble=True, exact=True)
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded_memopt, verify_ensemble=True, exact=True)
+
+    # verify that the copy works even after deleting the original files
+    verify_equivalent_repository(repo1=repo, repo2=repo_loaded_copy, verify_ensemble=True, exact=True, verify_config_fallback=False)
+
+    # verify that the copy stops working after deleting the copied files
+    repo_loaded_copy.predict_test(dataset="abalone", fold=0, config=repo_loaded_copy.configs()[0])
+    shutil.rmtree(copy_path)
+    with pytest.raises(FileNotFoundError):
+        repo_loaded_copy.predict_test(dataset="abalone", fold=0, config=repo_loaded_copy.configs()[0])
+
+
 def _assert_predict_multi_binary_as_multiclass(repo, fun: Callable, dataset, configs, n_rows, n_classes):
     problem_type = repo.dataset_info(dataset=dataset)["problem_type"]
     predict_multi = fun(dataset=dataset, fold=2, configs=configs)
diff --git a/tst/test_repository_utils.py b/tst/test_repository_utils.py
index 5de7339c..76e7b74c 100644
--- a/tst/test_repository_utils.py
+++ b/tst/test_repository_utils.py
@@ -7,11 +7,10 @@


 def test_get_runtime():
-    config_names = repo.configs()
     runtime_dict = get_runtime(
         repo,
-        tid=359944,
+        dataset="ada",
         fold=1,
         config_names=config_names,
     )
@@ -23,7 +22,7 @@ def test_get_runtime_time_infer_s():
     config_names = repo.configs()
     runtime_dict = get_runtime(
         repo,
-        tid=359944,
+        dataset="ada",
         fold=1,
         config_names=config_names,
         runtime_col='time_infer_s',
@@ -50,7 +49,7 @@ def test_filter_configs_by_runtime():
     ]:
         selected_configs = filter_configs_by_runtime(
             repo,
-            tid=359944,
+            dataset="ada",
             fold=1,
             config_names=config_names,
             max_cumruntime=max_cumruntime,