Source code for h2o.estimators.gbm

from .estimator_base import *


class H2OGradientBoostingEstimator(H2OEstimator):
    """
    Builds gradient boosted classification trees, and gradient boosted regression trees
    on a parsed data set.

    The default distribution function will guess the model type based on the response
    column type. In order to run properly, the response column must be numeric for
    "gaussian" or an enum for "bernoulli" or "multinomial".

    Every constructor argument is stored in ``self._parms`` under its own name and is
    exposed through a read/write property of the same name.

    Parameters
    ----------
    model_id : str, optional
        The unique id assigned to the resulting model. If none is given, an id will
        automatically be generated.
    distribution : str
        The distribution function of the response. Must be "AUTO", "bernoulli",
        "multinomial", "poisson", "gamma", "tweedie", "laplace", "quantile" or "gaussian"
    quantile_alpha : float
        Quantile (only for Quantile regression, must be between 0 and 1)
    tweedie_power : float
        Tweedie power (only for Tweedie distribution, must be between 1 and 2)
    ntrees : int
        A non-negative integer that determines the number of trees to grow.
    max_depth : int
        Maximum depth to grow the tree.
    min_rows : int
        Minimum number of rows to assign to terminal nodes.
    learn_rate : float
        Learning rate (from 0.0 to 1.0)
    learn_rate_annealing : float
        Multiply the learning rate by this factor after every tree. This argument is
        the last one in the constructor signature, so pass it by keyword.
    sample_rate : float
        Row sample rate per tree (from 0.0 to 1.0)
    sample_rate_per_class : list
        Row sample rate per tree per class (one per class, from 0.0 to 1.0)
    col_sample_rate : float
        Column sample rate per split (from 0.0 to 1.0)
    col_sample_rate_change_per_level : float
        Relative change of the column sampling rate for every level (from 0.0 to 2.0)
    col_sample_rate_per_tree : float
        Column sample rate per tree (from 0.0 to 1.0)
    nbins : int
        For numerical columns (real/int), build a histogram of (at least) this many bins,
        then split at the best point.
    nbins_top_level : int
        For numerical columns (real/int), build a histogram of (at most) this many bins
        at the root level, then decrease by factor of two per level.
    nbins_cats : int
        For categorical columns (factors), build a histogram of this many bins, then
        split at the best point. Higher values can lead to more overfitting.
    balance_classes : bool
        logical, indicates whether or not to balance training data class counts via
        over/under-sampling (for imbalanced data)
    class_sampling_factors : list
        Desired over/under-sampling ratios per class (in lexicographic order). If not
        specified, sampling factors will be automatically computed to obtain class
        balance during training. Requires balance_classes.
    max_after_balance_size : float
        Maximum relative size of the training data after balancing class counts (can be
        less than 1.0). Ignored if balance_classes is False, which is the default
        behavior.
    seed : int
        Seed for random numbers (affects sampling when balance_classes=T)
    build_tree_one_node : bool
        Run on one node only; no network overhead but fewer cpus used. Suitable for
        small datasets.
    nfolds : int, optional
        Number of folds for cross-validation. If nfolds >= 2, then validation must
        remain empty.
    fold_assignment : str
        Cross-validation fold assignment scheme, if fold_column is not specified, must be
        "AUTO", "Random", "Modulo", or "Stratified". The Stratified option will stratify
        the folds based on the response variable, for classification problems.
    keep_cross_validation_predictions : bool
        Whether to keep the predictions of the cross-validation models
    keep_cross_validation_fold_assignment : bool
        Whether to keep the cross-validation fold assignment.
    score_each_iteration : bool
        Attempts to score each tree.
    score_tree_interval : int
        Score the model after every so many trees. Disabled if set to 0.
    stopping_rounds : int
        Early stopping based on convergence of stopping_metric. Stop if simple moving
        average of length k of the stopping_metric does not improve (by
        stopping_tolerance) for k=stopping_rounds scoring events. Can only trigger after
        at least 2k scoring events. Use 0 to disable.
    stopping_metric : str
        Metric to use for convergence checking, only for stopping_rounds > 0. Can be one
        of "AUTO", "deviance", "logloss", "MSE", "AUC", "r2", "misclassification".
    stopping_tolerance : float
        Relative tolerance for metric-based stopping criterion (stop if relative
        improvement is not at least this much)
    checkpoint : optional
        Stored and exposed like every other parameter.
        NOTE(review): presumably identifies a previously built model to continue
        training from -- confirm against the H2O documentation.
    min_split_improvement : float
        Minimum relative improvement in squared error reduction for a split to happen
    random_split_points : bool
        Whether to use random split points for histograms (to pick the best split from).
    max_abs_leafnode_pred : float
        Maximum absolute value of a leaf node prediction.

    Returns
    -------
      A new H2OGradientBoostingEstimator object.
    """

    def __init__(self, model_id=None, distribution=None, quantile_alpha=None,
                 tweedie_power=None, ntrees=None, max_depth=None, min_rows=None,
                 learn_rate=None, nbins=None, sample_rate=None,
                 sample_rate_per_class=None, col_sample_rate=None,
                 col_sample_rate_change_per_level=None, col_sample_rate_per_tree=None,
                 nbins_top_level=None, nbins_cats=None, balance_classes=None,
                 class_sampling_factors=None, max_after_balance_size=None, seed=None,
                 build_tree_one_node=None, nfolds=None, fold_assignment=None,
                 keep_cross_validation_predictions=None,
                 keep_cross_validation_fold_assignment=None, stopping_rounds=None,
                 stopping_metric=None, stopping_tolerance=None,
                 score_each_iteration=None, score_tree_interval=None, checkpoint=None,
                 min_split_improvement=None, random_split_points=None,
                 max_abs_leafnode_pred=None, learn_rate_annealing=None):
        # learn_rate_annealing is appended at the end of the signature on purpose:
        # inserting it next to learn_rate would shift every later positional argument.
        super(H2OGradientBoostingEstimator, self).__init__()
        # locals() must be captured before any other local name is bound, so that the
        # snapshot holds exactly the constructor arguments (plus ``self``).
        self._parms = locals()
        self._parms = {k: v for k, v in self._parms.items() if k != "self"}

    @property
    def distribution(self):
        return self._parms["distribution"]

    @distribution.setter
    def distribution(self, value):
        self._parms["distribution"] = value

    @property
    def quantile_alpha(self):
        return self._parms["quantile_alpha"]

    @quantile_alpha.setter
    def quantile_alpha(self, value):
        self._parms["quantile_alpha"] = value

    @property
    def tweedie_power(self):
        return self._parms["tweedie_power"]

    @tweedie_power.setter
    def tweedie_power(self, value):
        self._parms["tweedie_power"] = value

    @property
    def ntrees(self):
        return self._parms["ntrees"]

    @ntrees.setter
    def ntrees(self, value):
        self._parms["ntrees"] = value

    @property
    def max_depth(self):
        return self._parms["max_depth"]

    @max_depth.setter
    def max_depth(self, value):
        self._parms["max_depth"] = value

    @property
    def min_rows(self):
        return self._parms["min_rows"]

    @min_rows.setter
    def min_rows(self, value):
        self._parms["min_rows"] = value

    @property
    def learn_rate(self):
        return self._parms["learn_rate"]

    @learn_rate.setter
    def learn_rate(self, value):
        self._parms["learn_rate"] = value

    @property
    def learn_rate_annealing(self):
        return self._parms["learn_rate_annealing"]

    @learn_rate_annealing.setter
    def learn_rate_annealing(self, value):
        self._parms["learn_rate_annealing"] = value

    @property
    def sample_rate(self):
        return self._parms["sample_rate"]

    @sample_rate.setter
    def sample_rate(self, value):
        self._parms["sample_rate"] = value

    @property
    def sample_rate_per_class(self):
        return self._parms["sample_rate_per_class"]

    @sample_rate_per_class.setter
    def sample_rate_per_class(self, value):
        self._parms["sample_rate_per_class"] = value

    @property
    def col_sample_rate(self):
        return self._parms["col_sample_rate"]

    @col_sample_rate.setter
    def col_sample_rate(self, value):
        self._parms["col_sample_rate"] = value

    @property
    def col_sample_rate_change_per_level(self):
        return self._parms["col_sample_rate_change_per_level"]

    @col_sample_rate_change_per_level.setter
    def col_sample_rate_change_per_level(self, value):
        self._parms["col_sample_rate_change_per_level"] = value

    # Misnamed accessor kept only so existing callers keep working. It used to read
    # and write a "col_sample_rate_change_per_tree" key that the constructor never
    # creates; it now forwards to the real "col_sample_rate_change_per_level" entry.
    @property
    def col_sample_rate_change_per_tree(self):
        return self._parms["col_sample_rate_change_per_level"]

    @col_sample_rate_change_per_tree.setter
    def col_sample_rate_change_per_tree(self, value):
        self._parms["col_sample_rate_change_per_level"] = value

    @property
    def col_sample_rate_per_tree(self):
        return self._parms["col_sample_rate_per_tree"]

    @col_sample_rate_per_tree.setter
    def col_sample_rate_per_tree(self, value):
        self._parms["col_sample_rate_per_tree"] = value

    @property
    def nbins(self):
        return self._parms["nbins"]

    @nbins.setter
    def nbins(self, value):
        self._parms["nbins"] = value

    @property
    def nbins_top_level(self):
        return self._parms["nbins_top_level"]

    @nbins_top_level.setter
    def nbins_top_level(self, value):
        self._parms["nbins_top_level"] = value

    @property
    def nbins_cats(self):
        return self._parms["nbins_cats"]

    @nbins_cats.setter
    def nbins_cats(self, value):
        self._parms["nbins_cats"] = value

    @property
    def balance_classes(self):
        return self._parms["balance_classes"]

    @balance_classes.setter
    def balance_classes(self, value):
        self._parms["balance_classes"] = value

    @property
    def class_sampling_factors(self):
        return self._parms["class_sampling_factors"]

    @class_sampling_factors.setter
    def class_sampling_factors(self, value):
        self._parms["class_sampling_factors"] = value

    @property
    def max_after_balance_size(self):
        return self._parms["max_after_balance_size"]

    @max_after_balance_size.setter
    def max_after_balance_size(self, value):
        self._parms["max_after_balance_size"] = value

    @property
    def seed(self):
        return self._parms["seed"]

    @seed.setter
    def seed(self, value):
        self._parms["seed"] = value

    @property
    def build_tree_one_node(self):
        return self._parms["build_tree_one_node"]

    @build_tree_one_node.setter
    def build_tree_one_node(self, value):
        self._parms["build_tree_one_node"] = value

    @property
    def nfolds(self):
        return self._parms["nfolds"]

    @nfolds.setter
    def nfolds(self, value):
        self._parms["nfolds"] = value

    @property
    def fold_assignment(self):
        return self._parms["fold_assignment"]

    @fold_assignment.setter
    def fold_assignment(self, value):
        self._parms["fold_assignment"] = value

    @property
    def keep_cross_validation_predictions(self):
        return self._parms["keep_cross_validation_predictions"]

    @keep_cross_validation_predictions.setter
    def keep_cross_validation_predictions(self, value):
        self._parms["keep_cross_validation_predictions"] = value

    @property
    def keep_cross_validation_fold_assignment(self):
        return self._parms["keep_cross_validation_fold_assignment"]

    @keep_cross_validation_fold_assignment.setter
    def keep_cross_validation_fold_assignment(self, value):
        self._parms["keep_cross_validation_fold_assignment"] = value

    @property
    def score_each_iteration(self):
        return self._parms["score_each_iteration"]

    @score_each_iteration.setter
    def score_each_iteration(self, value):
        self._parms["score_each_iteration"] = value

    @property
    def score_tree_interval(self):
        return self._parms["score_tree_interval"]

    @score_tree_interval.setter
    def score_tree_interval(self, value):
        self._parms["score_tree_interval"] = value

    @property
    def stopping_rounds(self):
        return self._parms["stopping_rounds"]

    @stopping_rounds.setter
    def stopping_rounds(self, value):
        self._parms["stopping_rounds"] = value

    @property
    def stopping_metric(self):
        return self._parms["stopping_metric"]

    @stopping_metric.setter
    def stopping_metric(self, value):
        self._parms["stopping_metric"] = value

    @property
    def stopping_tolerance(self):
        return self._parms["stopping_tolerance"]

    @stopping_tolerance.setter
    def stopping_tolerance(self, value):
        self._parms["stopping_tolerance"] = value

    @property
    def checkpoint(self):
        return self._parms["checkpoint"]

    @checkpoint.setter
    def checkpoint(self, value):
        self._parms["checkpoint"] = value

    @property
    def min_split_improvement(self):
        return self._parms["min_split_improvement"]

    @min_split_improvement.setter
    def min_split_improvement(self, value):
        self._parms["min_split_improvement"] = value

    @property
    def random_split_points(self):
        return self._parms["random_split_points"]

    @random_split_points.setter
    def random_split_points(self, value):
        self._parms["random_split_points"] = value

    @property
    def max_abs_leafnode_pred(self):
        return self._parms["max_abs_leafnode_pred"]

    @max_abs_leafnode_pred.setter
    def max_abs_leafnode_pred(self, value):
        self._parms["max_abs_leafnode_pred"] = value