diff --git a/m2cgen/assemblers/boosting.py b/m2cgen/assemblers/boosting.py
index 2f6f090f..e66b91ee 100644
--- a/m2cgen/assemblers/boosting.py
+++ b/m2cgen/assemblers/boosting.py
@@ -9,10 +9,12 @@ class BaseBoostingAssembler(ModelAssembler):
 
     classifier_name = None
 
-    def __init__(self, model, trees, base_score=0, tree_limit=None):
+    def __init__(self, model, trees, base_score=0, tree_limit=None,
+                 leaves_cutoff_threshold=3000):
         super().__init__(model)
         self.all_trees = trees
         self._base_score = base_score
+        self._leaves_cutoff_threshold = leaves_cutoff_threshold
 
         self._output_size = 1
         self._is_classification = False
@@ -41,10 +43,19 @@ def _assemble_single_output(self, trees, base_score=0):
             trees = trees[:self._tree_limit]
 
         trees_ast = [self._assemble_tree(t) for t in trees]
+        to_sum = trees_ast
+
+        # In a large tree we need to generate multiple subroutines to avoid
+        # java limitations https://github.com/BayesWitnesses/m2cgen/issues/103.
+        trees_num_leaves = [self._count_leaves(t) for t in trees]
+        if sum(trees_num_leaves) > self._leaves_cutoff_threshold:
+            to_sum = self._split_into_subroutines(trees_ast, trees_num_leaves)
+
         result_ast = utils.apply_op_to_expressions(
             ast.BinNumOpType.ADD,
             ast.NumVal(base_score),
-            *trees_ast)
+            *to_sum)
+
         return ast.SubroutineExpr(result_ast)
 
     def _assemble_multi_class_output(self, trees):
@@ -74,15 +85,47 @@ def _assemble_bin_class_output(self, trees):
             proba_expr
         ])
 
+    def _split_into_subroutines(self, trees_ast, trees_num_leaves):
+        result = []
+        subroutine_trees = []
+        subroutine_sum_leaves = 0
+        for tree, num_leaves in zip(trees_ast, trees_num_leaves):
+            next_sum = subroutine_sum_leaves + num_leaves
+            if subroutine_trees and next_sum > self._leaves_cutoff_threshold:
+                # Exceeded the max leaves in the current subroutine,
+                # finalize this one and start a new one.
+                partial_result = utils.apply_op_to_expressions(
+                    ast.BinNumOpType.ADD,
+                    *subroutine_trees)
+
+                result.append(ast.SubroutineExpr(partial_result))
+
+                subroutine_trees = []
+                subroutine_sum_leaves = 0
+
+            subroutine_sum_leaves += num_leaves
+            subroutine_trees.append(tree)
+
+        if subroutine_trees:
+            partial_result = utils.apply_op_to_expressions(
+                ast.BinNumOpType.ADD,
+                *subroutine_trees)
+            result.append(ast.SubroutineExpr(partial_result))
+        return result
+
     def _assemble_tree(self, tree):
         raise NotImplementedError
 
+    @staticmethod
+    def _count_leaves(trees):
+        raise NotImplementedError
+
 
 class XGBoostModelAssembler(BaseBoostingAssembler):
 
     classifier_name = "XGBClassifier"
 
-    def __init__(self, model):
+    def __init__(self, model, leaves_cutoff_threshold=3000):
         feature_names = model.get_booster().feature_names
         self._feature_name_to_idx = {
             name: idx for idx, name in enumerate(feature_names or [])
@@ -96,7 +139,8 @@ def __init__(self, model):
         best_ntree_limit = getattr(model, "best_ntree_limit", None)
 
         super().__init__(model, trees, base_score=model.base_score,
-                         tree_limit=best_ntree_limit)
+                         tree_limit=best_ntree_limit,
+                         leaves_cutoff_threshold=leaves_cutoff_threshold)
 
     def _assemble_tree(self, tree):
         if "leaf" in tree:
@@ -130,16 +174,31 @@ def _assemble_child_tree(self, tree, child_id):
             return self._assemble_tree(child)
         assert False, "Unexpected child ID {}".format(child_id)
 
+    @staticmethod
+    def _count_leaves(tree):
+        queue = [tree]
+        num_leaves = 0
+
+        while queue:
+            tree = queue.pop()
+            if "leaf" in tree:
+                num_leaves += 1
+            elif "children" in tree:
+                for child in tree["children"]:
+                    queue.append(child)
+        return num_leaves
+
 
 class LightGBMModelAssembler(BaseBoostingAssembler):
 
     classifier_name = "LGBMClassifier"
 
-    def __init__(self, model):
+    def __init__(self, model, leaves_cutoff_threshold=3000):
         model_dump = model.booster_.dump_model()
         trees = [m["tree_structure"] for m in model_dump["tree_info"]]
 
-        super().__init__(model, trees)
+        super().__init__(model, trees,
+                         leaves_cutoff_threshold=leaves_cutoff_threshold)
 
     def _assemble_tree(self, tree):
         if "leaf_value" in tree:
@@ -151,9 +210,9 @@ def _assemble_tree(self, tree):
         op = ast.CompOpType.from_str_op(tree["decision_type"])
         assert op == ast.CompOpType.LTE, "Unexpected comparison op"
 
-        # Make sure that if the 'default_left' is true the left tree branch
+        # Make sure that if the "default_left" is true the left tree branch
         # ends up in the "else" branch of the ast.IfExpr.
-        if tree['default_left']:
+        if tree["default_left"]:
             op = ast.CompOpType.GT
             true_child = tree["right_child"]
             false_child = tree["left_child"]
@@ -166,6 +225,20 @@ def _assemble_tree(self, tree):
             self._assemble_tree(true_child),
             self._assemble_tree(false_child))
 
+    @staticmethod
+    def _count_leaves(tree):
+        queue = [tree]
+        num_leaves = 0
+
+        while queue:
+            tree = queue.pop()
+            if "leaf_value" in tree:
+                num_leaves += 1
+            else:
+                queue.append(tree["left_child"])
+                queue.append(tree["right_child"])
+        return num_leaves
+
 
 def _split_trees_by_classes(trees, n_classes):
     # Splits are computed based on a comment
diff --git a/tests/assemblers/test_lightgbm.py b/tests/assemblers/test_lightgbm.py
index 36303c3b..8fcd9df1 100644
--- a/tests/assemblers/test_lightgbm.py
+++ b/tests/assemblers/test_lightgbm.py
@@ -110,3 +110,53 @@ def test_regression():
             ast.BinNumOpType.ADD))
 
     assert utils.cmp_exprs(actual, expected)
+
+
+def test_leaves_cutoff_threshold():
+    estimator = lightgbm.LGBMClassifier(n_estimators=2, random_state=1,
+                                        max_depth=1)
+    utils.train_model_classification_binary(estimator)
+
+    assembler = assemblers.LightGBMModelAssembler(estimator,
+                                                  leaves_cutoff_threshold=1)
+    actual = assembler.assemble()
+
+    sigmoid = ast.BinNumExpr(
+        ast.NumVal(1),
+        ast.BinNumExpr(
+            ast.NumVal(1),
+            ast.ExpExpr(
+                ast.BinNumExpr(
+                    ast.NumVal(0),
+                    ast.SubroutineExpr(
+                        ast.BinNumExpr(
+                            ast.BinNumExpr(
+                                ast.NumVal(0),
+                                ast.SubroutineExpr(
+                                    ast.IfExpr(
+                                        ast.CompExpr(
+                                            ast.FeatureRef(23),
+                                            ast.NumVal(868.2000000000002),
+                                            ast.CompOpType.GT),
+                                        ast.NumVal(0.2762557140263451),
+                                        ast.NumVal(0.6399134166614473))),
+                                ast.BinNumOpType.ADD),
+                            ast.SubroutineExpr(
+                                ast.IfExpr(
+                                    ast.CompExpr(
+                                        ast.FeatureRef(27),
+                                        ast.NumVal(0.14205000000000004),
+                                        ast.CompOpType.GT),
+                                    ast.NumVal(-0.2139321843285849),
+                                    ast.NumVal(0.1151466338793227))),
+                            ast.BinNumOpType.ADD)),
+                    ast.BinNumOpType.SUB)),
+            ast.BinNumOpType.ADD),
+        ast.BinNumOpType.DIV,
+        to_reuse=True)
+
+    expected = ast.VectorVal([
+        ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
+        sigmoid])
+
+    assert utils.cmp_exprs(actual, expected)
diff --git a/tests/assemblers/test_xgboost.py b/tests/assemblers/test_xgboost.py
index 7195bb0d..7fc5c659 100644
--- a/tests/assemblers/test_xgboost.py
+++ b/tests/assemblers/test_xgboost.py
@@ -268,3 +268,53 @@ def test_regression_saved_without_feature_names():
             ast.BinNumOpType.ADD))
 
     assert utils.cmp_exprs(actual, expected)
+
+
+def test_leaves_cutoff_threshold():
+    estimator = xgboost.XGBClassifier(n_estimators=2, random_state=1,
+                                      max_depth=1)
+    utils.train_model_classification_binary(estimator)
+
+    assembler = assemblers.XGBoostModelAssembler(estimator,
+                                                 leaves_cutoff_threshold=1)
+    actual = assembler.assemble()
+
+    sigmoid = ast.BinNumExpr(
+        ast.NumVal(1),
+        ast.BinNumExpr(
+            ast.NumVal(1),
+            ast.ExpExpr(
+                ast.BinNumExpr(
+                    ast.NumVal(0),
+                    ast.SubroutineExpr(
+                        ast.BinNumExpr(
+                            ast.BinNumExpr(
+                                ast.NumVal(-0.0),
+                                ast.SubroutineExpr(
+                                    ast.IfExpr(
+                                        ast.CompExpr(
+                                            ast.FeatureRef(20),
+                                            ast.NumVal(16.7950001),
+                                            ast.CompOpType.GTE),
+                                        ast.NumVal(-0.17062147),
+                                        ast.NumVal(0.1638484))),
+                                ast.BinNumOpType.ADD),
+                            ast.SubroutineExpr(
+                                ast.IfExpr(
+                                    ast.CompExpr(
+                                        ast.FeatureRef(27),
+                                        ast.NumVal(0.142349988),
+                                        ast.CompOpType.GTE),
+                                    ast.NumVal(-0.16087772),
+                                    ast.NumVal(0.149866998))),
+                            ast.BinNumOpType.ADD)),
+                    ast.BinNumOpType.SUB)),
+            ast.BinNumOpType.ADD),
+        ast.BinNumOpType.DIV,
+        to_reuse=True)
+
+    expected = ast.VectorVal([
+        ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
+        sigmoid])
+
+    assert utils.cmp_exprs(actual, expected)
diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py
index 8b64bb4a..1e1c1b2b 100644
--- a/tests/e2e/test_e2e.py
+++ b/tests/e2e/test_e2e.py
@@ -51,6 +51,30 @@ def classification_binary(model):
     )
 
 
+def regression_random(model):
+    return (
+        model,
+        utils.train_model_regression_random_data,
+        REGRESSION,
+    )
+
+
+def classification_random(model):
+    return (
+        model,
+        utils.train_model_classification_random_data,
+        CLASSIFICATION,
+    )
+
+
+def classification_binary_random(model):
+    return (
+        model,
+        utils.train_model_classification_binary_random_data,
+        CLASSIFICATION,
+    )
+
+
 # Absolute tolerance. Used in np.isclose to compare 2 values.
 # We compare 6 decimal digits.
 ATOL = 1.e-6
@@ -63,6 +87,11 @@ def classification_binary(model):
 LIGHT_GBM_PARAMS = dict(n_estimators=10, random_state=RANDOM_SEED)
 SVC_PARAMS = dict(random_state=RANDOM_SEED, decision_function_shape="ovo")
 
+XGBOOST_PARAMS_LARGE = dict(base_score=0.6, n_estimators=100, max_depth=12,
+                            random_state=RANDOM_SEED)
+LIGHT_GBM_PARAMS_LARGE = dict(n_estimators=100, num_leaves=100, max_depth=64,
+                              random_state=RANDOM_SEED)
+
 
 @utils.cartesian_e2e_params(
     # These are the languages which support all models specified in the
@@ -85,11 +114,27 @@ def classification_binary(model):
         classification(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
         classification_binary(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
 
+        # LightGBM (Large Trees)
+        regression_random(
+            lightgbm.LGBMRegressor(**LIGHT_GBM_PARAMS_LARGE)),
+        classification_random(
+            lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
+        classification_binary_random(
+            lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
+
         # XGBoost
         regression(xgboost.XGBRegressor(**XGBOOST_PARAMS)),
         classification(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
         classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
 
+        # XGBoost (Large Trees)
+        regression_random(
+            xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
+        classification_random(
+            xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
+        classification_binary_random(
+            xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
+
         # Linear SVM
         regression(svm.LinearSVR(random_state=RANDOM_SEED)),
         classification(svm.LinearSVC(random_state=RANDOM_SEED)),
diff --git a/tests/utils.py b/tests/utils.py
index 5e736791..575c8206 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -75,8 +75,41 @@ def train_model_classification_binary(estimator, test_fraction=0.1):
         test_fraction)
 
 
+def train_model_regression_random_data(estimator, test_fraction=0.01):
+    np.random.seed(seed=7)
+    N = 1000
+    data = np.random.random(size=(N, 200))
+    target = np.random.random(size=(N, 1))
+
+    return _train_model(estimator, (data, target), test_fraction)
+
+
+def train_model_classification_random_data(estimator, test_fraction=0.01):
+    np.random.seed(seed=7)
+    N = 1000
+
+    data = np.random.random(size=(N, 200))
+    target = np.random.randint(3, size=(N,))
+
+    return _train_model(estimator, (data, target), test_fraction)
+
+
+def train_model_classification_binary_random_data(estimator,
+                                                  test_fraction=0.01):
+    np.random.seed(seed=7)
+    N = 1000
+
+    data = np.random.random(size=(N, 200))
+    target = np.random.randint(2, size=(N,))
+
+    return _train_model(estimator, (data, target), test_fraction)
+
+
 def _train_model(estimator, dataset, test_fraction):
-    X, y = shuffle(dataset.data, dataset.target, random_state=13)
+    if isinstance(dataset, tuple):
+        X, y = dataset
+    else:
+        X, y = shuffle(dataset.data, dataset.target, random_state=13)
     offset = int(X.shape[0] * (1 - test_fraction))
 
     X_train, y_train = X[:offset], y[:offset]
@@ -120,8 +153,15 @@ def verify_python_model_is_expected(model_code, input, expected_output):
 
 
 def predict_from_commandline(exec_args):
-    result = subprocess.Popen(exec_args, stdout=subprocess.PIPE)
-    items = result.stdout.read().decode("utf-8").strip().split(" ")
+    result = subprocess.Popen(exec_args, stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+    stdout, stderr = result.communicate()
+    if result.returncode != 0:
+        raise Exception("bad exit code ({}) stderr: {}".format(
+            result.returncode, stderr.decode("utf-8")))
+
+    items = stdout.decode("utf-8").strip().split(" ")
+
     if len(items) == 1:
         return float(items[0])
     else:
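
Usage note (not part of the patch above): a minimal sketch of the new leaves_cutoff_threshold argument, mirroring the unit tests. The synthetic dataset and the deliberately tiny threshold are illustrative assumptions; the assembler class, the parameter name, and assemble() come from the patch itself.

    import lightgbm
    import numpy as np
    from m2cgen import assemblers

    # Illustrative random data; any fitted LGBMClassifier would do.
    X = np.random.random(size=(1000, 10))
    y = np.random.randint(2, size=(1000,))
    estimator = lightgbm.LGBMClassifier(n_estimators=10, max_depth=2)
    estimator.fit(X, y)

    # A tiny threshold forces the assembler to wrap groups of trees in
    # separate SubroutineExpr nodes instead of one huge expression, which is
    # what keeps generated code (e.g. Java) within the limits referenced in
    # https://github.com/BayesWitnesses/m2cgen/issues/103.
    assembler = assemblers.LightGBMModelAssembler(estimator,
                                                  leaves_cutoff_threshold=1)
    model_ast = assembler.assemble()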