#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
class H2OGradientBoostingEstimator(H2OEstimator):
"""
Gradient Boosting Machine
Builds gradient boosted trees on a parsed data set, for regression or classification.
With the default distribution ("auto"), the model type is inferred from the type of the response column.
If a distribution is specified explicitly, the response column must be an enum for "bernoulli" or
"multinomial", and numeric for all other distributions.
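:examples:
A minimal end-to-end sketch (assumes a running H2O cluster, started here with ``h2o.init()``; the CSV
is one of H2O's public test datasets used throughout this file):
>>> import h2o
>>> from h2o.estimators import H2OGradientBoostingEstimator
>>> h2o.init()
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()  # enum response -> binomial GBM
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> gbm = H2OGradientBoostingEstimator(ntrees=100, max_depth=4, seed=1234)
>>> gbm.train(x=["displacement","power","weight","acceleration","year"],
... y="economy_20mpg",
... training_frame=train,
... validation_frame=valid)
>>> gbm.auc(valid=True)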
"""
algo = "gbm"
supervised_learning = True
_options_ = {'model_extensions': ['h2o.model.extensions.ScoringHistoryTrees',
'h2o.model.extensions.VariableImportance',
'h2o.model.extensions.FeatureInteraction',
'h2o.model.extensions.Trees',
'h2o.model.extensions.SupervisedTrees',
'h2o.model.extensions.HStatistic',
'h2o.model.extensions.Contributions',
'h2o.model.extensions.Fairness',
'h2o.model.extensions.RowToTreeAssignment'],
'verbose': True}
def __init__(self,
model_id=None, # type: Optional[Union[None, str, H2OEstimator]]
training_frame=None, # type: Optional[Union[None, str, H2OFrame]]
validation_frame=None, # type: Optional[Union[None, str, H2OFrame]]
nfolds=0, # type: int
keep_cross_validation_models=True, # type: bool
keep_cross_validation_predictions=False, # type: bool
keep_cross_validation_fold_assignment=False, # type: bool
score_each_iteration=False, # type: bool
score_tree_interval=0, # type: int
fold_assignment="auto", # type: Literal["auto", "random", "modulo", "stratified"]
fold_column=None, # type: Optional[str]
response_column=None, # type: Optional[str]
ignored_columns=None, # type: Optional[List[str]]
ignore_const_cols=True, # type: bool
offset_column=None, # type: Optional[str]
weights_column=None, # type: Optional[str]
balance_classes=False, # type: bool
class_sampling_factors=None, # type: Optional[List[float]]
max_after_balance_size=5.0, # type: float
max_confusion_matrix_size=20, # type: int
ntrees=50, # type: int
max_depth=5, # type: int
min_rows=10.0, # type: float
nbins=20, # type: int
nbins_top_level=1024, # type: int
nbins_cats=1024, # type: int
r2_stopping=None, # type: Optional[float]
stopping_rounds=0, # type: int
stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
stopping_tolerance=0.001, # type: float
max_runtime_secs=0.0, # type: float
seed=-1, # type: int
build_tree_one_node=False, # type: bool
learn_rate=0.1, # type: float
learn_rate_annealing=1.0, # type: float
distribution="auto", # type: Literal["auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom"]
quantile_alpha=0.5, # type: float
tweedie_power=1.5, # type: float
huber_alpha=0.9, # type: float
checkpoint=None, # type: Optional[Union[None, str, H2OEstimator]]
sample_rate=1.0, # type: float
sample_rate_per_class=None, # type: Optional[List[float]]
col_sample_rate=1.0, # type: float
col_sample_rate_change_per_level=1.0, # type: float
col_sample_rate_per_tree=1.0, # type: float
min_split_improvement=1e-05, # type: float
histogram_type="auto", # type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"]
max_abs_leafnode_pred=None, # type: Optional[float]
pred_noise_bandwidth=0.0, # type: float
categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
calibrate_model=False, # type: bool
calibration_frame=None, # type: Optional[Union[None, str, H2OFrame]]
calibration_method="auto", # type: Literal["auto", "platt_scaling", "isotonic_regression"]
custom_metric_func=None, # type: Optional[str]
custom_distribution_func=None, # type: Optional[str]
export_checkpoints_dir=None, # type: Optional[str]
in_training_checkpoints_dir=None, # type: Optional[str]
in_training_checkpoints_tree_interval=1, # type: int
monotone_constraints=None, # type: Optional[dict]
check_constant_response=True, # type: bool
gainslift_bins=-1, # type: int
auc_type="auto", # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
interaction_constraints=None, # type: Optional[List[List[str]]]
auto_rebalance=True, # type: bool
):
"""
:param model_id: Destination id for this model; auto-generated if not specified.
Defaults to ``None``.
:type model_id: Union[None, str, H2OEstimator], optional
:param training_frame: Id of the training data frame.
Defaults to ``None``.
:type training_frame: Union[None, str, H2OFrame], optional
:param validation_frame: Id of the validation data frame.
Defaults to ``None``.
:type validation_frame: Union[None, str, H2OFrame], optional
:param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2).
Defaults to ``0``.
:type nfolds: int
:param keep_cross_validation_models: Whether to keep the cross-validation models.
Defaults to ``True``.
:type keep_cross_validation_models: bool
:param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
Defaults to ``False``.
:type keep_cross_validation_predictions: bool
:param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment.
Defaults to ``False``.
:type keep_cross_validation_fold_assignment: bool
:param score_each_iteration: Whether to score during each iteration of model training.
Defaults to ``False``.
:type score_each_iteration: bool
:param score_tree_interval: Score the model after every so many trees. Disabled if set to 0.
Defaults to ``0``.
:type score_tree_interval: int
:param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The
'Stratified' option will stratify the folds based on the response variable, for classification problems.
Defaults to ``"auto"``.
:type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
:param fold_column: Column with cross-validation fold index assignment per observation.
Defaults to ``None``.
:type fold_column: str, optional
:param response_column: Response variable column.
Defaults to ``None``.
:type response_column: str, optional
:param ignored_columns: Names of columns to ignore for training.
Defaults to ``None``.
:type ignored_columns: List[str], optional
:param ignore_const_cols: Ignore constant columns.
Defaults to ``True``.
:type ignore_const_cols: bool
:param offset_column: Offset column. This will be added to the combination of columns before applying the link
function.
Defaults to ``None``.
:type offset_column: str, optional
:param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
not increase the size of the data frame. This is typically the number of times a row is repeated, but
non-integer values are supported as well. During training, rows with higher weights matter more, due to
the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0.
Defaults to ``None``.
:type weights_column: str, optional
:param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
Defaults to ``False``.
:type balance_classes: bool
:param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not
specified, sampling factors will be automatically computed to obtain class balance during training.
Requires balance_classes.
Defaults to ``None``.
:type class_sampling_factors: List[float], optional
:param max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be
less than 1.0). Requires balance_classes.
Defaults to ``5.0``.
:type max_after_balance_size: float
:param max_confusion_matrix_size: [Deprecated] Maximum size (# classes) for confusion matrices to be printed in
the logs.
Defaults to ``20``.
:type max_confusion_matrix_size: int
:param ntrees: Number of trees.
Defaults to ``50``.
:type ntrees: int
:param max_depth: Maximum tree depth (0 for unlimited).
Defaults to ``5``.
:type max_depth: int
:param min_rows: Fewest allowed (weighted) observations in a leaf.
Defaults to ``10.0``.
:type min_rows: float
:param nbins: For numerical columns (real/int), build a histogram of (at least) this many bins, then split at
the best point
Defaults to ``20``.
:type nbins: int
:param nbins_top_level: For numerical columns (real/int), build a histogram of (at most) this many bins at the
root level, then decrease by factor of two per level
Defaults to ``1024``.
:type nbins_top_level: int
:param nbins_cats: For categorical columns (factors), build a histogram of this many bins, then split at the
best point. Higher values can lead to more overfitting.
Defaults to ``1024``.
:type nbins_cats: int
:param r2_stopping: r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds,
stopping_metric and stopping_tolerance instead. Previous versions of H2O would stop making trees when the
R^2 metric equaled or exceeded this value.
Defaults to ``∞``.
:type r2_stopping: float
:param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average of
length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
Defaults to ``0``.
:type stopping_rounds: int
:param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for
regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be
used in GBM and DRF with the Python client.
Defaults to ``"auto"``.
:type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
"misclassification", "mean_per_class_error", "custom", "custom_increasing"]
:param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative improvement
is not at least this much)
Defaults to ``0.001``.
:type stopping_tolerance: float
:param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
Defaults to ``0.0``.
:type max_runtime_secs: float
:param seed: Seed for pseudo random number generator (if applicable)
Defaults to ``-1``.
:type seed: int
:param build_tree_one_node: Run on one node only; no network overhead but fewer CPUs used. Suitable for small
datasets.
Defaults to ``False``.
:type build_tree_one_node: bool
:param learn_rate: Learning rate (from 0.0 to 1.0)
Defaults to ``0.1``.
:type learn_rate: float
:param learn_rate_annealing: Scale the learning rate by this factor after each tree (e.g., 0.99 or 0.999)
Defaults to ``1.0``.
:type learn_rate_annealing: float
:param distribution: Distribution function
Defaults to ``"auto"``.
:type distribution: Literal["auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie",
"laplace", "quantile", "huber", "custom"]
:param quantile_alpha: Desired quantile for Quantile regression, must be between 0 and 1.
Defaults to ``0.5``.
:type quantile_alpha: float
:param tweedie_power: Tweedie power for Tweedie regression, must be between 1 and 2.
Defaults to ``1.5``.
:type tweedie_power: float
:param huber_alpha: Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must
be between 0 and 1).
Defaults to ``0.9``.
:type huber_alpha: float
:param checkpoint: Model checkpoint to resume training with.
Defaults to ``None``.
:type checkpoint: Union[None, str, H2OEstimator], optional
:param sample_rate: Row sample rate per tree (from 0.0 to 1.0)
Defaults to ``1.0``.
:type sample_rate: float
:param sample_rate_per_class: A list of row sample rates per class (relative fraction for each class, from 0.0
to 1.0), for each tree
Defaults to ``None``.
:type sample_rate_per_class: List[float], optional
:param col_sample_rate: Column sample rate (from 0.0 to 1.0)
Defaults to ``1.0``.
:type col_sample_rate: float
:param col_sample_rate_change_per_level: Relative change of the column sampling rate for every level (must be >
0.0 and <= 2.0)
Defaults to ``1.0``.
:type col_sample_rate_change_per_level: float
:param col_sample_rate_per_tree: Column sample rate per tree (from 0.0 to 1.0)
Defaults to ``1.0``.
:type col_sample_rate_per_tree: float
:param min_split_improvement: Minimum relative improvement in squared error reduction for a split to happen
Defaults to ``1e-05``.
:type min_split_improvement: float
:param histogram_type: What type of histogram to use for finding optimal split points
Defaults to ``"auto"``.
:type histogram_type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"]
:param max_abs_leafnode_pred: Maximum absolute value of a leaf node prediction
Defaults to ``∞``.
:type max_abs_leafnode_pred: float
:param pred_noise_bandwidth: Bandwidth (sigma) of Gaussian multiplicative noise ~N(1,sigma) for tree node
predictions
Defaults to ``0.0``.
:type pred_noise_bandwidth: float
:param categorical_encoding: Encoding scheme for categorical features
Defaults to ``"auto"``.
:type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
"sort_by_response", "enum_limited"]
:param calibrate_model: Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class
probabilities. Calibration can provide more accurate estimates of class probabilities.
Defaults to ``False``.
:type calibrate_model: bool
:param calibration_frame: Data for model calibration
Defaults to ``None``.
:type calibration_frame: Union[None, str, H2OFrame], optional
:param calibration_method: Calibration method to use
Defaults to ``"auto"``.
:type calibration_method: Literal["auto", "platt_scaling", "isotonic_regression"]
:param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName`
Defaults to ``None``.
:type custom_metric_func: str, optional
:param custom_distribution_func: Reference to custom distribution, format: `language:keyName=funcName`
Defaults to ``None``.
:type custom_distribution_func: str, optional
:param export_checkpoints_dir: Automatically export generated models to this directory.
Defaults to ``None``.
:type export_checkpoints_dir: str, optional
:param in_training_checkpoints_dir: Create checkpoints into defined directory while training process is still
running. In case of cluster shutdown, this checkpoint can be used to restart training.
Defaults to ``None``.
:type in_training_checkpoints_dir: str, optional
:param in_training_checkpoints_tree_interval: Checkpoint the model after every so many trees. Parameter is used
only when in_training_checkpoints_dir is defined
Defaults to ``1``.
:type in_training_checkpoints_tree_interval: int
:param monotone_constraints: A mapping representing monotonic constraints. Use +1 to enforce an increasing
constraint and -1 to specify a decreasing constraint (see the sketch at the end of this docstring).
Defaults to ``None``.
:type monotone_constraints: dict, optional
:param check_constant_response: Check if the response column is a constant. If enabled, an exception is thrown
when the response column is a constant value. If disabled, the model will train regardless of whether the
response column is constant.
Defaults to ``True``.
:type check_constant_response: bool
:param gainslift_bins: Gains/Lift table number of bins. 0 means disabled. The default value of -1 means
automatic binning.
Defaults to ``-1``.
:type gainslift_bins: int
:param auc_type: Set default multinomial AUC type.
Defaults to ``"auto"``.
:type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
:param interaction_constraints: A set of allowed column interactions.
Defaults to ``None``.
:type interaction_constraints: List[List[str]], optional
:param auto_rebalance: Allow automatic rebalancing of training and validation datasets
Defaults to ``True``.
:type auto_rebalance: bool
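:examples:
A hedged construction sketch; the monotonic constraint on "weight" (direction -1, i.e. heavier cars
should not be predicted as more likely to reach 20 mpg) is illustrative, not prescriptive:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=50,
... monotone_constraints={"weight": -1},
... seed=1234)
>>> cars_gbm.train(x=["displacement","power","weight","acceleration","year"],
... y="economy_20mpg",
... training_frame=cars)
>>> cars_gbm.auc(train=True)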
"""
super(H2OGradientBoostingEstimator, self).__init__()
self._parms = {}
self._id = self._parms['model_id'] = model_id
self.training_frame = training_frame
self.validation_frame = validation_frame
self.nfolds = nfolds
self.keep_cross_validation_models = keep_cross_validation_models
self.keep_cross_validation_predictions = keep_cross_validation_predictions
self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
self.score_each_iteration = score_each_iteration
self.score_tree_interval = score_tree_interval
self.fold_assignment = fold_assignment
self.fold_column = fold_column
self.response_column = response_column
self.ignored_columns = ignored_columns
self.ignore_const_cols = ignore_const_cols
self.offset_column = offset_column
self.weights_column = weights_column
self.balance_classes = balance_classes
self.class_sampling_factors = class_sampling_factors
self.max_after_balance_size = max_after_balance_size
self.max_confusion_matrix_size = max_confusion_matrix_size
self.ntrees = ntrees
self.max_depth = max_depth
self.min_rows = min_rows
self.nbins = nbins
self.nbins_top_level = nbins_top_level
self.nbins_cats = nbins_cats
self.r2_stopping = r2_stopping
self.stopping_rounds = stopping_rounds
self.stopping_metric = stopping_metric
self.stopping_tolerance = stopping_tolerance
self.max_runtime_secs = max_runtime_secs
self.seed = seed
self.build_tree_one_node = build_tree_one_node
self.learn_rate = learn_rate
self.learn_rate_annealing = learn_rate_annealing
self.distribution = distribution
self.quantile_alpha = quantile_alpha
self.tweedie_power = tweedie_power
self.huber_alpha = huber_alpha
self.checkpoint = checkpoint
self.sample_rate = sample_rate
self.sample_rate_per_class = sample_rate_per_class
self.col_sample_rate = col_sample_rate
self.col_sample_rate_change_per_level = col_sample_rate_change_per_level
self.col_sample_rate_per_tree = col_sample_rate_per_tree
self.min_split_improvement = min_split_improvement
self.histogram_type = histogram_type
self.max_abs_leafnode_pred = max_abs_leafnode_pred
self.pred_noise_bandwidth = pred_noise_bandwidth
self.categorical_encoding = categorical_encoding
self.calibrate_model = calibrate_model
self.calibration_frame = calibration_frame
self.calibration_method = calibration_method
self.custom_metric_func = custom_metric_func
self.custom_distribution_func = custom_distribution_func
self.export_checkpoints_dir = export_checkpoints_dir
self.in_training_checkpoints_dir = in_training_checkpoints_dir
self.in_training_checkpoints_tree_interval = in_training_checkpoints_tree_interval
self.monotone_constraints = monotone_constraints
self.check_constant_response = check_constant_response
self.gainslift_bins = gainslift_bins
self.auc_type = auc_type
self.interaction_constraints = interaction_constraints
self.auto_rebalance = auto_rebalance
@property
def training_frame(self):
"""
Id of the training data frame.
Type: ``Union[None, str, H2OFrame]``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("training_frame")
@training_frame.setter
def training_frame(self, training_frame):
self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def validation_frame(self):
"""
Id of the validation data frame.
Type: ``Union[None, str, H2OFrame]``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("validation_frame")
@validation_frame.setter
def validation_frame(self, validation_frame):
self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def nfolds(self):
"""
Number of folds for K-fold cross-validation (0 to disable or >= 2).
Type: ``int``, defaults to ``0``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> cars_gbm = H2OGradientBoostingEstimator(nfolds=folds,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=cars)
>>> cars_gbm.auc()
"""
return self._parms.get("nfolds")
@nfolds.setter
def nfolds(self, nfolds):
assert_is_type(nfolds, None, int)
self._parms["nfolds"] = nfolds
@property
def keep_cross_validation_models(self):
"""
Whether to keep the cross-validation models.
Type: ``bool``, defaults to ``True``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_models=True,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc()
"""
return self._parms.get("keep_cross_validation_models")
@keep_cross_validation_models.setter
def keep_cross_validation_models(self, keep_cross_validation_models):
assert_is_type(keep_cross_validation_models, None, bool)
self._parms["keep_cross_validation_models"] = keep_cross_validation_models
@property
def keep_cross_validation_predictions(self):
"""
Whether to keep the predictions of the cross-validation models.
Type: ``bool``, defaults to ``False``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_predictions=True,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc()
"""
return self._parms.get("keep_cross_validation_predictions")
@keep_cross_validation_predictions.setter
def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
assert_is_type(keep_cross_validation_predictions, None, bool)
self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions
@property
def keep_cross_validation_fold_assignment(self):
"""
Whether to keep the cross-validation fold assignment.
Type: ``bool``, defaults to ``False``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_fold_assignment=True,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc()
"""
return self._parms.get("keep_cross_validation_fold_assignment")
@keep_cross_validation_fold_assignment.setter
def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
assert_is_type(keep_cross_validation_fold_assignment, None, bool)
self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment
@property
def score_each_iteration(self):
"""
Whether to score during each iteration of model training.
Type: ``bool``, defaults to ``False``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
... seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(score_each_iteration=True,
... ntrees=55,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.scoring_history()
"""
return self._parms.get("score_each_iteration")
@score_each_iteration.setter
def score_each_iteration(self, score_each_iteration):
assert_is_type(score_each_iteration, None, bool)
self._parms["score_each_iteration"] = score_each_iteration
@property
def score_tree_interval(self):
"""
Score the model after every so many trees. Disabled if set to 0.
Type: ``int``, defaults to ``0``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
... seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(score_tree_interval=5,
... ntrees=55,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.scoring_history()
"""
return self._parms.get("score_tree_interval")
@score_tree_interval.setter
def score_tree_interval(self, score_tree_interval):
assert_is_type(score_tree_interval, None, int)
self._parms["score_tree_interval"] = score_tree_interval
@property
def fold_assignment(self):
"""
Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
the folds based on the response variable, for classification problems.
Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> assignment_type = "Random"
>>> cars_gbm = H2OGradientBoostingEstimator(fold_assignment=assignment_type,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors, y=response, training_frame=cars)
>>> cars_gbm.auc(xval=True)
"""
return self._parms.get("fold_assignment")
@fold_assignment.setter
def fold_assignment(self, fold_assignment):
assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified"))
self._parms["fold_assignment"] = fold_assignment
@property
def fold_column(self):
"""
Column with cross-validation fold index assignment per observation.
Type: ``str``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> fold_numbers = cars.kfold_column(n_folds=5,
... seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=cars,
... fold_column="fold_numbers")
>>> cars_gbm.auc(xval=True)
"""
return self._parms.get("fold_column")
@fold_column.setter
def fold_column(self, fold_column):
assert_is_type(fold_column, None, str)
self._parms["fold_column"] = fold_column
@property
def response_column(self):
"""
Response variable column.
Type: ``str``.
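:examples:
A minimal sketch (same public "cars" dataset used throughout this file); it assumes ``train()``
falls back to ``response_column`` when ``y`` is omitted:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_gbm = H2OGradientBoostingEstimator(response_column="economy_20mpg",
... seed=1234)
>>> cars_gbm.train(x=["displacement","power","weight","acceleration","year"],
... training_frame=cars)
>>> cars_gbm.auc(train=True)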
"""
return self._parms.get("response_column")
@response_column.setter
def response_column(self, response_column):
assert_is_type(response_column, None, str)
self._parms["response_column"] = response_column
@property
def ignored_columns(self):
"""
Names of columns to ignore for training.
Type: ``List[str]``.
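:examples:
A minimal sketch; "name" and "economy" are assumed to be columns of the public "cars" dataset and are
excluded here. When ``x`` is omitted, every column except the response and the ignored ones is used as
a predictor:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_gbm = H2OGradientBoostingEstimator(ignored_columns=["name", "economy"],
... seed=1234)
>>> cars_gbm.train(y="economy_20mpg", training_frame=cars)
>>> cars_gbm.auc(train=True)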
"""
return self._parms.get("ignored_columns")
@ignored_columns.setter
def ignored_columns(self, ignored_columns):
assert_is_type(ignored_columns, None, [str])
self._parms["ignored_columns"] = ignored_columns
@property
def ignore_const_cols(self):
"""
Ignore constant columns.
Type: ``bool``, defaults to ``True``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234,
... ignore_const_cols=True)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("ignore_const_cols")
@ignore_const_cols.setter
def ignore_const_cols(self, ignore_const_cols):
assert_is_type(ignore_const_cols, None, bool)
self._parms["ignore_const_cols"] = ignore_const_cols
@property
def offset_column(self):
"""
Offset column. This will be added to the combination of columns before applying the link function.
Type: ``str``.
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston["offset"] = boston["medv"].log()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_gbm = H2OGradientBoostingEstimator(offset_column="offset",
... seed=1234)
>>> boston_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> boston_gbm.mse(valid=True)
"""
return self._parms.get("offset_column")
@offset_column.setter
def offset_column(self, offset_column):
assert_is_type(offset_column, None, str)
self._parms["offset_column"] = offset_column
@property
def weights_column(self):
"""
Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an
accurate prediction, remove all rows with weight == 0.
Type: ``str``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid,
... weights_column="weight")
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("weights_column")
@weights_column.setter
def weights_column(self, weights_column):
assert_is_type(weights_column, None, str)
self._parms["weights_column"] = weights_column
@property
def balance_classes(self):
"""
Balance training data class counts via over/under-sampling (for imbalanced data).
Type: ``bool``, defaults to ``False``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("balance_classes")
@balance_classes.setter
def balance_classes(self, balance_classes):
assert_is_type(balance_classes, None, bool)
self._parms["balance_classes"] = balance_classes
@property
def class_sampling_factors(self):
"""
Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
be automatically computed to obtain class balance during training. Requires balance_classes.
Type: ``List[float]``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
... class_sampling_factors=sample_factors,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("class_sampling_factors")
@class_sampling_factors.setter
def class_sampling_factors(self, class_sampling_factors):
assert_is_type(class_sampling_factors, None, [float])
self._parms["class_sampling_factors"] = class_sampling_factors
@property
def max_after_balance_size(self):
"""
Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
balance_classes.
Type: ``float``, defaults to ``5.0``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> max_size = .85
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
... max_after_balance_size=max_size,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("max_after_balance_size")
@max_after_balance_size.setter
def max_after_balance_size(self, max_after_balance_size):
assert_is_type(max_after_balance_size, None, float)
self._parms["max_after_balance_size"] = max_after_balance_size
@property
def max_confusion_matrix_size(self):
"""
[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the logs.
Type: ``int``, defaults to ``20``.
"""
return self._parms.get("max_confusion_matrix_size")
@max_confusion_matrix_size.setter
def max_confusion_matrix_size(self, max_confusion_matrix_size):
assert_is_type(max_confusion_matrix_size, None, int)
self._parms["max_confusion_matrix_size"] = max_confusion_matrix_size
@property
def ntrees(self):
"""
Number of trees.
Type: ``int``, defaults to ``50``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> tree_num = [20, 50, 80, 110, 140, 170, 200]
>>> label = ["20", "50", "80", "110", "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
... titanic_gbm = H2OGradientBoostingEstimator(ntrees=num,
... seed=1234)
... titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', titanic_gbm.auc(train=True))
... print(label[key], 'validation score', titanic_gbm.auc(valid=True))
"""
return self._parms.get("ntrees")
@ntrees.setter
def ntrees(self, ntrees):
assert_is_type(ntrees, None, int)
self._parms["ntrees"] = ntrees
@property
def max_depth(self):
"""
Maximum tree depth (0 for unlimited).
Type: ``int``, defaults to ``5``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=100,
... max_depth=2,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("max_depth")
@max_depth.setter
def max_depth(self, max_depth):
assert_is_type(max_depth, None, int)
self._parms["max_depth"] = max_depth
@property
def min_rows(self):
"""
Fewest allowed (weighted) observations in a leaf.
Type: ``float``, defaults to ``10.0``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(min_rows=16,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("min_rows")
@min_rows.setter
def min_rows(self, min_rows):
assert_is_type(min_rows, None, numeric)
self._parms["min_rows"] = min_rows
@property
def nbins(self):
"""
For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point
Type: ``int``, defaults to ``20``.
:examples:
>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [16, 32, 64, 128, 256, 512]
>>> label = ["16", "32", "64", "128", "256", "512"]
>>> for key, num in enumerate(bin_num):
... eeg_gbm = H2OGradientBoostingEstimator(nbins=num, seed=1234)
... eeg_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', eeg_gbm.auc(train=True))
... print(label[key], 'validation score', eeg_gbm.auc(valid=True))
"""
return self._parms.get("nbins")
@nbins.setter
def nbins(self, nbins):
assert_is_type(nbins, None, int)
self._parms["nbins"] = nbins
@property
def nbins_top_level(self):
"""
For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease
by factor of two per level
Type: ``int``, defaults to ``1024``.
:examples:
>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [32, 64, 128, 256, 512, 1024, 2048, 4096]
>>> label = ["32", "64", "128", "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
... eeg_gbm = H2OGradientBoostingEstimator(nbins_top_level=num, seed=1234)
... eeg_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', eeg_gbm.auc(train=True))
... print(label[key], 'validation score', eeg_gbm.auc(valid=True))
"""
return self._parms.get("nbins_top_level")
@nbins_top_level.setter
def nbins_top_level(self, nbins_top_level):
assert_is_type(nbins_top_level, None, int)
self._parms["nbins_top_level"] = nbins_top_level
@property
def nbins_cats(self):
"""
For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher
values can lead to more overfitting.
Type: ``int``, defaults to ``1024``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
>>> label = ["8", "16", "32", "64", "128", "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
... airlines_gbm = H2OGradientBoostingEstimator(nbins_cats=num, seed=1234)
... airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', airlines_gbm.auc(train=True))
... print(label[key], 'validation score', airlines_gbm.auc(valid=True))
"""
return self._parms.get("nbins_cats")
@nbins_cats.setter
def nbins_cats(self, nbins_cats):
assert_is_type(nbins_cats, None, int)
self._parms["nbins_cats"] = nbins_cats
@property
def r2_stopping(self):
"""
r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and
stopping_tolerance instead. Previous versions of H2O would stop making trees when the R^2 metric equaled or
exceeded this value.
Type: ``float``, defaults to ``∞``.
"""
return self._parms.get("r2_stopping")
@r2_stopping.setter
def r2_stopping(self, r2_stopping):
assert_is_type(r2_stopping, None, numeric)
self._parms["r2_stopping"] = r2_stopping
@property
def stopping_rounds(self):
"""
Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
Type: ``int``, defaults to ``0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("stopping_rounds")
@stopping_rounds.setter
def stopping_rounds(self, stopping_rounds):
assert_is_type(stopping_rounds, None, int)
self._parms["stopping_rounds"] = stopping_rounds
@property
def stopping_metric(self):
"""
Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score
for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
client.
Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
"misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("stopping_metric")
@stopping_metric.setter
def stopping_metric(self, stopping_metric):
assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"))
self._parms["stopping_metric"] = stopping_metric
@property
def stopping_tolerance(self):
"""
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)
Type: ``float``, defaults to ``0.001``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("stopping_tolerance")
@stopping_tolerance.setter
def stopping_tolerance(self, stopping_tolerance):
assert_is_type(stopping_tolerance, None, numeric)
self._parms["stopping_tolerance"] = stopping_tolerance
@property
def max_runtime_secs(self):
"""
Maximum allowed runtime in seconds for model training. Use 0 to disable.
Type: ``float``, defaults to ``0.0``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(max_runtime_secs=10,
... ntrees=10000,
... max_depth=10,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("max_runtime_secs")
@max_runtime_secs.setter
def max_runtime_secs(self, max_runtime_secs):
assert_is_type(max_runtime_secs, None, numeric)
self._parms["max_runtime_secs"] = max_runtime_secs
@property
def seed(self):
"""
Seed for pseudo random number generator (if applicable)
Type: ``int``, defaults to ``-1``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> gbm_w_seed_1 = H2OGradientBoostingEstimator(col_sample_rate=.7,
... seed=1234)
>>> gbm_w_seed_1.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print('auc for the 1st model built with a seed:', gbm_w_seed_1.auc(valid=True))
"""
return self._parms.get("seed")
@seed.setter
def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed
@property
def build_tree_one_node(self):
"""
Run on one node only; no network overhead but fewer CPUs used. Suitable for small datasets.
Type: ``bool``, defaults to ``False``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(build_tree_one_node=True,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("build_tree_one_node")
@build_tree_one_node.setter
def build_tree_one_node(self, build_tree_one_node):
assert_is_type(build_tree_one_node, None, bool)
self._parms["build_tree_one_node"] = build_tree_one_node
@property
def learn_rate(self):
"""
Learning rate (from 0.0 to 1.0)
Type: ``float``, defaults to ``0.1``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
... learn_rate=0.01,
... stopping_rounds=5,
... stopping_metric="AUC",
... stopping_tolerance=1e-4,
... seed=1234)
>>> titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
"""
return self._parms.get("learn_rate")
@learn_rate.setter
def learn_rate(self, learn_rate):
assert_is_type(learn_rate, None, numeric)
self._parms["learn_rate"] = learn_rate
@property
def learn_rate_annealing(self):
"""
Scale the learning rate by this factor after each tree (e.g., 0.99 or 0.999)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
... learn_rate=0.05,
... learn_rate_annealing=.9,
... stopping_rounds=5,
... stopping_metric="AUC",
... stopping_tolerance=1e-4,
... seed=1234)
>>> titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
"""
return self._parms.get("learn_rate_annealing")
@learn_rate_annealing.setter
def learn_rate_annealing(self, learn_rate_annealing):
assert_is_type(learn_rate_annealing, None, numeric)
self._parms["learn_rate_annealing"] = learn_rate_annealing
@property
def distribution(self):
"""
Distribution function
Type: ``Literal["auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie",
"laplace", "quantile", "huber", "custom"]``, defaults to ``"auto"``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(distribution="poisson",
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.mse(valid=True)
"""
return self._parms.get("distribution")
@distribution.setter
def distribution(self, distribution):
assert_is_type(distribution, None, Enum("auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom"))
self._parms["distribution"] = distribution
@property
def quantile_alpha(self):
"""
Desired quantile for Quantile regression, must be between 0 and 1.
Type: ``float``, defaults to ``0.5``.
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_gbm = H2OGradientBoostingEstimator(distribution="quantile",
... quantile_alpha=.8,
... seed=1234)
>>> boston_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> boston_gbm.mse(valid=True)
"""
return self._parms.get("quantile_alpha")
@quantile_alpha.setter
def quantile_alpha(self, quantile_alpha):
assert_is_type(quantile_alpha, None, numeric)
self._parms["quantile_alpha"] = quantile_alpha
@property
def tweedie_power(self):
"""
Tweedie power for Tweedie regression, must be between 1 and 2.
Type: ``float``, defaults to ``1.5``.
:examples:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_gbm = H2OGradientBoostingEstimator(distribution="tweedie",
... tweedie_power=1.2,
... seed=1234)
>>> insurance_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> insurance_gbm.mse(valid=True)
"""
return self._parms.get("tweedie_power")
@tweedie_power.setter
def tweedie_power(self, tweedie_power):
assert_is_type(tweedie_power, None, numeric)
self._parms["tweedie_power"] = tweedie_power
@property
def huber_alpha(self):
"""
Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and 1).
Type: ``float``, defaults to ``0.9``.
:examples:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_gbm = H2OGradientBoostingEstimator(distribution="huber",
... huber_alpha=0.9,
... seed=1234)
>>> insurance_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> insurance_gbm.mse(valid=True)
"""
return self._parms.get("huber_alpha")
@huber_alpha.setter
def huber_alpha(self, huber_alpha):
assert_is_type(huber_alpha, None, numeric)
self._parms["huber_alpha"] = huber_alpha
@property
def checkpoint(self):
"""
Model checkpoint to resume training with.
Type: ``Union[None, str, H2OEstimator]``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=1,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(cars_gbm.auc(valid=True))
>>> print("Number of trees built for cars_gbm model:", cars_gbm.ntrees)
>>> cars_gbm_continued = H2OGradientBoostingEstimator(checkpoint=cars_gbm.model_id,
... ntrees=50,
... seed=1234)
>>> cars_gbm_continued.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm_continued.auc(valid=True)
>>> print("Number of trees built for cars_gbm model:",cars_gbm_continued.ntrees)
"""
return self._parms.get("checkpoint")
@checkpoint.setter
def checkpoint(self, checkpoint):
assert_is_type(checkpoint, None, str, H2OEstimator)
self._parms["checkpoint"] = checkpoint
@property
def sample_rate(self):
"""
Row sample rate per tree (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(sample_rate=.7,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("sample_rate")
@sample_rate.setter
def sample_rate(self, sample_rate):
assert_is_type(sample_rate, None, numeric)
self._parms["sample_rate"] = sample_rate
@property
def sample_rate_per_class(self):
"""
A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), applied to each tree
Type: ``List[float]``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1]
>>> cov_gbm = H2OGradientBoostingEstimator(sample_rate_per_class=rate_per_class_list,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("sample_rate_per_class")
@sample_rate_per_class.setter
def sample_rate_per_class(self, sample_rate_per_class):
assert_is_type(sample_rate_per_class, None, [numeric])
self._parms["sample_rate_per_class"] = sample_rate_per_class
@property
def col_sample_rate(self):
"""
Column sample rate (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate=.7,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("col_sample_rate")
@col_sample_rate.setter
def col_sample_rate(self, col_sample_rate):
assert_is_type(col_sample_rate, None, numeric)
self._parms["col_sample_rate"] = col_sample_rate
@property
def col_sample_rate_change_per_level(self):
"""
Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_change_per_level=.9,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("col_sample_rate_change_per_level")
@col_sample_rate_change_per_level.setter
def col_sample_rate_change_per_level(self, col_sample_rate_change_per_level):
assert_is_type(col_sample_rate_change_per_level, None, numeric)
self._parms["col_sample_rate_change_per_level"] = col_sample_rate_change_per_level
@property
def col_sample_rate_per_tree(self):
"""
Column sample rate per tree (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_per_tree=.7,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("col_sample_rate_per_tree")
@col_sample_rate_per_tree.setter
def col_sample_rate_per_tree(self, col_sample_rate_per_tree):
assert_is_type(col_sample_rate_per_tree, None, numeric)
self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree
@property
def min_split_improvement(self):
"""
Minimum relative improvement in squared error reduction for a split to happen
Type: ``float``, defaults to ``1e-05``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(min_split_improvement=1e-3,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("min_split_improvement")
@min_split_improvement.setter
def min_split_improvement(self, min_split_improvement):
assert_is_type(min_split_improvement, None, numeric)
self._parms["min_split_improvement"] = min_split_improvement
@property
def histogram_type(self):
"""
What type of histogram to use for finding optimal split points
Type: ``Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"]``,
defaults to ``"auto"``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(histogram_type="uniform_adaptive",
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("histogram_type")
@histogram_type.setter
def histogram_type(self, histogram_type):
assert_is_type(histogram_type, None, Enum("auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"))
self._parms["histogram_type"] = histogram_type
@property
def max_abs_leafnode_pred(self):
"""
Maximum absolute value of a leaf node prediction
Type: ``float``, defaults to ``∞``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(max_abs_leafnode_pred=2,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("max_abs_leafnode_pred")
@max_abs_leafnode_pred.setter
def max_abs_leafnode_pred(self, max_abs_leafnode_pred):
assert_is_type(max_abs_leafnode_pred, None, numeric)
self._parms["max_abs_leafnode_pred"] = max_abs_leafnode_pred
@property
def pred_noise_bandwidth(self):
"""
Bandwidth (sigma) of Gaussian multiplicative noise ~N(1,sigma) for tree node predictions
Type: ``float``, defaults to ``0.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(pred_noise_bandwidth=0.1,
... seed=1234)
>>> titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
"""
return self._parms.get("pred_noise_bandwidth")
@pred_noise_bandwidth.setter
def pred_noise_bandwidth(self, pred_noise_bandwidth):
assert_is_type(pred_noise_bandwidth, None, numeric)
self._parms["pred_noise_bandwidth"] = pred_noise_bandwidth
@property
def categorical_encoding(self):
"""
Encoding scheme for categorical features
Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
"sort_by_response", "enum_limited"]``, defaults to ``"auto"``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(categorical_encoding="label_encoder",
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("categorical_encoding")
@categorical_encoding.setter
def categorical_encoding(self, categorical_encoding):
assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
self._parms["categorical_encoding"] = categorical_encoding
@property
def calibrate_model(self):
"""
Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class probabilities. Calibration can
provide more accurate estimates of class probabilities.
Type: ``bool``, defaults to ``False``.
:examples:
>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> train, calib = ecology.split_frame(seed=12354)
>>> predictors = ecology.columns[3:13]
>>> w = h2o.create_frame(binary_fraction=1,
... binary_ones_fraction=0.5,
... missing_fraction=0,
... rows=744, cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
... max_depth=5,
... min_rows=10,
... learn_rate=0.1,
... distribution="multinomial",
... weights_column="weight",
... calibrate_model=True,
... calibration_frame=calib)
>>> ecology_gbm.train(x=predictors,
... y="Angaus",
... training_frame=train)
>>> ecology_gbm.auc()
"""
return self._parms.get("calibrate_model")
@calibrate_model.setter
def calibrate_model(self, calibrate_model):
assert_is_type(calibrate_model, None, bool)
self._parms["calibrate_model"] = calibrate_model
@property
def calibration_frame(self):
"""
Data for model calibration
Type: ``Union[None, str, H2OFrame]``.
:examples:
>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> predictors = ecology.columns[3:13]
>>> train, calib = ecology.split_frame(seed=12354)
>>> w = h2o.create_frame(binary_fraction=1,
... binary_ones_fraction=0.5,
... missing_fraction=0,
... rows=744,cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
... max_depth=5,
... min_rows=10,
... learn_rate=0.1,
... distribution="multinomial",
... calibrate_model=True,
... calibration_frame=calib)
>>> ecology_gbm.train(x=predictors,
... y="Angaus",
... training_frame=train,
... weights_column="weight")
>>> ecology_gbm.auc()
"""
return self._parms.get("calibration_frame")
@calibration_frame.setter
def calibration_frame(self, calibration_frame):
self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame')
@property
def calibration_method(self):
"""
Calibration method to use
Type: ``Literal["auto", "platt_scaling", "isotonic_regression"]``, defaults to ``"auto"``.
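:examples:
(A minimal sketch modeled on the ``calibrate_model`` example below; the choice of isotonic regression here is illustrative.)
>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> predictors = ecology.columns[3:13]
>>> train, calib = ecology.split_frame(seed=12354)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
... calibrate_model=True,
... calibration_frame=calib,
... calibration_method="isotonic_regression")
>>> ecology_gbm.train(x=predictors,
... y="Angaus",
... training_frame=train)
>>> ecology_gbm.auc()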
"""
return self._parms.get("calibration_method")
@calibration_method.setter
def calibration_method(self, calibration_method):
assert_is_type(calibration_method, None, Enum("auto", "platt_scaling", "isotonic_regression"))
self._parms["calibration_method"] = calibration_method
@property
def custom_metric_func(self):
"""
Reference to custom evaluation function, format: `language:keyName=funcName`
Type: ``str``.
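:examples:
(A minimal sketch: the class below follows the map/reduce/metric protocol expected by ``h2o.upload_custom_metric``, and the function and file names are illustrative. The metric body is executed on the H2O backend, hence the ``java.lang.Math`` import.)
>>> class CustomRmseFunc:
...     def map(self, pred, act, w, o, model):
...         # squared error on the probability of the actual class
...         idx = int(act[0])
...         err = 1 - pred[idx + 1] if idx + 1 < len(pred) else 1
...         return [err * err, 1]
...     def reduce(self, l, r):
...         return [l[0] + r[0], l[1] + r[1]]
...     def metric(self, l):
...         import java.lang.Math as math
...         return math.sqrt(l[0] / l[1])
>>> custom_metric = h2o.upload_custom_metric(CustomRmseFunc,
... func_name="rmse",
... func_file="mm_rmse.py")
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_gbm = H2OGradientBoostingEstimator(custom_metric_func=custom_metric,
... seed=1234)
>>> cars_gbm.train(x=["displacement","power","weight","acceleration","year"],
... y="economy_20mpg",
... training_frame=cars)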
"""
return self._parms.get("custom_metric_func")
@custom_metric_func.setter
def custom_metric_func(self, custom_metric_func):
assert_is_type(custom_metric_func, None, str)
self._parms["custom_metric_func"] = custom_metric_func
@property
def custom_distribution_func(self):
"""
Reference to custom distribution, format: `language:keyName=funcName`
Type: ``str``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(ntrees=3,
... max_depth=5,
... distribution="bernoulli",
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> from h2o.utils.distributions import CustomDistributionBernoulli
>>> custom_distribution_bernoulli = h2o.upload_custom_distribution(CustomDistributionBernoulli,
... func_name="custom_bernoulli",
... func_file="custom_bernoulli.py")
>>> airlines_gbm_custom = H2OGradientBoostingEstimator(ntrees=3,
... max_depth=5,
... distribution="custom",
... custom_distribution_func=custom_distribution_bernoulli,
... seed=1235)
>>> airlines_gbm_custom.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm_custom.auc()
"""
return self._parms.get("custom_distribution_func")
@custom_distribution_func.setter
def custom_distribution_func(self, custom_distribution_func):
assert_is_type(custom_distribution_func, None, str)
self._parms["custom_distribution_func"] = custom_distribution_func
@property
def export_checkpoints_dir(self):
"""
Automatically export generated models to this directory.
Type: ``str``.
:examples:
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> hyper_parameters = {'ntrees': [5,10]}
>>> search_crit = {'strategy': "RandomDiscrete",
... 'max_models': 5,
... 'seed': 1234,
... 'stopping_rounds': 3,
... 'stopping_metric': "AUTO",
... 'stopping_tolerance': 1e-2}
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
... hyper_params=hyper_parameters,
... search_criteria=search_crit)
>>> air_grid.train(x=predictors,
... y=response,
... training_frame=airlines,
... distribution="bernoulli",
... learn_rate=0.1,
... max_depth=3,
... export_checkpoints_dir=checkpoints_dir)
>>> len(listdir(checkpoints_dir))
"""
return self._parms.get("export_checkpoints_dir")
@export_checkpoints_dir.setter
def export_checkpoints_dir(self, export_checkpoints_dir):
assert_is_type(export_checkpoints_dir, None, str)
self._parms["export_checkpoints_dir"] = export_checkpoints_dir
@property
def in_training_checkpoints_dir(self):
"""
Create checkpoints in the defined directory while the training process is still running. In case of a cluster
shutdown, the checkpoint can be used to restart training.
Type: ``str``.
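:examples:
(A minimal sketch; the temporary directory is illustrative.)
>>> import tempfile
>>> from os import listdir
>>> checkpoints_dir = tempfile.mkdtemp()
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=20,
... in_training_checkpoints_dir=checkpoints_dir,
... seed=1234)
>>> cars_gbm.train(x=["displacement","power","weight","acceleration","year"],
... y="economy_20mpg",
... training_frame=cars)
>>> len(listdir(checkpoints_dir))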
"""
return self._parms.get("in_training_checkpoints_dir")
@in_training_checkpoints_dir.setter
def in_training_checkpoints_dir(self, in_training_checkpoints_dir):
assert_is_type(in_training_checkpoints_dir, None, str)
self._parms["in_training_checkpoints_dir"] = in_training_checkpoints_dir
@property
def in_training_checkpoints_tree_interval(self):
"""
Checkpoint the model after every so many trees. This parameter is only used when in_training_checkpoints_dir is
defined.
Type: ``int``, defaults to ``1``.
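:examples:
(A minimal sketch following the ``in_training_checkpoints_dir`` example above; with ``ntrees=50`` and an interval of 10, roughly five checkpoints would be written.)
>>> import tempfile
>>> checkpoints_dir = tempfile.mkdtemp()
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=50,
... in_training_checkpoints_dir=checkpoints_dir,
... in_training_checkpoints_tree_interval=10,
... seed=1234)
>>> cars_gbm.train(x=["displacement","power","weight","acceleration","year"],
... y="economy_20mpg",
... training_frame=cars)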
"""
return self._parms.get("in_training_checkpoints_tree_interval")
@in_training_checkpoints_tree_interval.setter
def in_training_checkpoints_tree_interval(self, in_training_checkpoints_tree_interval):
assert_is_type(in_training_checkpoints_tree_interval, None, int)
self._parms["in_training_checkpoints_tree_interval"] = in_training_checkpoints_tree_interval
@property
def monotone_constraints(self):
"""
A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a
decreasing constraint.
Type: ``dict``.
:examples:
>>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor()
>>> response = "CAPSULE"
>>> seed = 42
>>> monotone_constraints = {"AGE":1}
>>> gbm_model = H2OGradientBoostingEstimator(seed=seed,
... monotone_constraints=monotone_constraints)
>>> gbm_model.train(y=response,
... ignored_columns=["ID"],
... training_frame=prostate_hex)
>>> gbm_model.scoring_history()
"""
return self._parms.get("monotone_constraints")
@monotone_constraints.setter
def monotone_constraints(self, monotone_constraints):
assert_is_type(monotone_constraints, None, dict)
self._parms["monotone_constraints"] = monotone_constraints
@property
def check_constant_response(self):
"""
Check if the response column is constant. If enabled, an exception is thrown when the response column is a
constant value. If disabled, the model will train regardless of whether the response column is constant.
Type: ``bool``, defaults to ``True``.
:examples:
>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
>>> train["constantCol"] = 1
>>> my_gbm = H2OGradientBoostingEstimator(check_constant_response=False)
>>> my_gbm.train(x=list(range(1,5)),
... y="constantCol",
... training_frame=train)
"""
return self._parms.get("check_constant_response")
@check_constant_response.setter
def check_constant_response(self, check_constant_response):
assert_is_type(check_constant_response, None, bool)
self._parms["check_constant_response"] = check_constant_response
@property
def gainslift_bins(self):
"""
Number of bins for the Gains/Lift table. 0 disables the table; the default value of -1 means automatic binning.
Type: ``int``, defaults to ``-1``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
... y="IsDepDelayed",
... training_frame=airlines)
>>> model.gains_lift()
"""
return self._parms.get("gainslift_bins")
@gainslift_bins.setter
def gainslift_bins(self, gainslift_bins):
assert_is_type(gainslift_bins, None, int)
self._parms["gainslift_bins"] = gainslift_bins
@property
def auc_type(self):
"""
Set default multinomial AUC type.
Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
``"auto"``.
"""
return self._parms.get("auc_type")
@auc_type.setter
def auc_type(self, auc_type):
assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
self._parms["auc_type"] = auc_type
@property
def interaction_constraints(self):
"""
A set of allowed column interactions.
Type: ``List[List[str]]``.
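:examples:
(A minimal sketch reusing the columns from the ``gainslift_bins`` example; placing ``Origin`` and ``Distance`` in separate groups disallows splits that combine the two within a single tree.)
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OGradientBoostingEstimator(ntrees=10,
... interaction_constraints=[["Origin"], ["Distance"]],
... seed=1234)
>>> model.train(x=["Origin", "Distance"],
... y="IsDepDelayed",
... training_frame=airlines)
>>> model.auc()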
"""
return self._parms.get("interaction_constraints")
@interaction_constraints.setter
def interaction_constraints(self, interaction_constraints):
assert_is_type(interaction_constraints, None, [[str]])
self._parms["interaction_constraints"] = interaction_constraints
@property
def auto_rebalance(self):
"""
Allow automatic rebalancing of training and validation datasets
Type: ``bool``, defaults to ``True``.
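:examples:
(A minimal sketch; disabling automatic rebalancing is shown only for illustration, since rebalancing normally just redistributes the data for better parallelism.)
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OGradientBoostingEstimator(ntrees=1,
... auto_rebalance=False,
... seed=1234)
>>> model.train(x=["Origin", "Distance"],
... y="IsDepDelayed",
... training_frame=airlines)
>>> model.auc()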
"""
return self._parms.get("auto_rebalance")
@auto_rebalance.setter
def auto_rebalance(self, auto_rebalance):
assert_is_type(auto_rebalance, None, bool)
self._parms["auto_rebalance"] = auto_rebalance