#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
[docs]class H2OGradientBoostingEstimator(H2OEstimator):
"""
Gradient Boosting Machine
Builds gradient boosted trees on a parsed data set, for regression or classification.
The default distribution function will guess the model type based on the response column type.
Otherwise, the response column must be an enum for "bernoulli" or "multinomial", and numeric
for all other distributions.
"""
algo = "gbm"
param_names = {"model_id", "training_frame", "validation_frame", "nfolds", "keep_cross_validation_models",
"keep_cross_validation_predictions", "keep_cross_validation_fold_assignment", "score_each_iteration",
"score_tree_interval", "fold_assignment", "fold_column", "response_column", "ignored_columns",
"ignore_const_cols", "offset_column", "weights_column", "balance_classes", "class_sampling_factors",
"max_after_balance_size", "max_confusion_matrix_size", "ntrees", "max_depth", "min_rows", "nbins",
"nbins_top_level", "nbins_cats", "r2_stopping", "stopping_rounds", "stopping_metric",
"stopping_tolerance", "max_runtime_secs", "seed", "build_tree_one_node", "learn_rate",
"learn_rate_annealing", "distribution", "quantile_alpha", "tweedie_power", "huber_alpha",
"checkpoint", "sample_rate", "sample_rate_per_class", "col_sample_rate",
"col_sample_rate_change_per_level", "col_sample_rate_per_tree", "min_split_improvement",
"histogram_type", "max_abs_leafnode_pred", "pred_noise_bandwidth", "categorical_encoding",
"calibrate_model", "calibration_frame", "custom_metric_func", "custom_distribution_func",
"export_checkpoints_dir", "monotone_constraints", "check_constant_response", "gainslift_bins",
"auc_type"}
def __init__(self, **kwargs):
super(H2OGradientBoostingEstimator, self).__init__()
self._parms = {}
for pname, pvalue in kwargs.items():
if pname == 'model_id':
self._id = pvalue
self._parms["model_id"] = pvalue
elif pname in self.param_names:
# Using setattr(...) will invoke type-checking of the arguments
setattr(self, pname, pvalue)
else:
raise H2OValueError("Unknown parameter %s = %r" % (pname, pvalue))
@property
def training_frame(self):
"""
Id of the training data frame.
Type: ``H2OFrame``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("training_frame")
@training_frame.setter
def training_frame(self, training_frame):
self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def validation_frame(self):
"""
Id of the validation data frame.
Type: ``H2OFrame``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("validation_frame")
@validation_frame.setter
def validation_frame(self, validation_frame):
self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def nfolds(self):
"""
Number of folds for K-fold cross-validation (0 to disable or >= 2).
Type: ``int`` (default: ``0``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> cars_gbm = H2OGradientBoostingEstimator(nfolds=folds,
... seed=1234
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=cars)
>>> cars_gbm.auc()
"""
return self._parms.get("nfolds")
@nfolds.setter
def nfolds(self, nfolds):
assert_is_type(nfolds, None, int)
self._parms["nfolds"] = nfolds
@property
def keep_cross_validation_models(self):
"""
Whether to keep the cross-validation models.
Type: ``bool`` (default: ``True``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_models=True,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc()
"""
return self._parms.get("keep_cross_validation_models")
@keep_cross_validation_models.setter
def keep_cross_validation_models(self, keep_cross_validation_models):
assert_is_type(keep_cross_validation_models, None, bool)
self._parms["keep_cross_validation_models"] = keep_cross_validation_models
@property
def keep_cross_validation_predictions(self):
"""
Whether to keep the predictions of the cross-validation models.
Type: ``bool`` (default: ``False``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_predictions=True,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc()
"""
return self._parms.get("keep_cross_validation_predictions")
@keep_cross_validation_predictions.setter
def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
assert_is_type(keep_cross_validation_predictions, None, bool)
self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions
@property
def keep_cross_validation_fold_assignment(self):
"""
Whether to keep the cross-validation fold assignment.
Type: ``bool`` (default: ``False``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_fold_assignment=True,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc()
"""
return self._parms.get("keep_cross_validation_fold_assignment")
@keep_cross_validation_fold_assignment.setter
def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
assert_is_type(keep_cross_validation_fold_assignment, None, bool)
self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment
@property
def score_each_iteration(self):
"""
Whether to score during each iteration of model training.
Type: ``bool`` (default: ``False``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
... seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(score_each_iteration=True,
... ntrees=55,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.scoring_history()
"""
return self._parms.get("score_each_iteration")
@score_each_iteration.setter
def score_each_iteration(self, score_each_iteration):
assert_is_type(score_each_iteration, None, bool)
self._parms["score_each_iteration"] = score_each_iteration
@property
def score_tree_interval(self):
"""
Score the model after every so many trees. Disabled if set to 0.
Type: ``int`` (default: ``0``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
... seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(score_tree_interval=True,
... ntrees=55,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.scoring_history()
"""
return self._parms.get("score_tree_interval")
@score_tree_interval.setter
def score_tree_interval(self, score_tree_interval):
assert_is_type(score_tree_interval, None, int)
self._parms["score_tree_interval"] = score_tree_interval
@property
def fold_assignment(self):
"""
Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
the folds based on the response variable, for classification problems.
One of: ``"auto"``, ``"random"``, ``"modulo"``, ``"stratified"`` (default: ``"auto"``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> assignment_type = "Random"
>>> cars_gbm = H2OGradientBoostingEstimator(fold_assignment=assignment_type,
... nfolds=5,
... seed=1234)
>>> cars_gbm.train(x=predictors, y=response, training_frame=cars)
>>> cars_gbm.auc(xval=True)
"""
return self._parms.get("fold_assignment")
@fold_assignment.setter
def fold_assignment(self, fold_assignment):
assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified"))
self._parms["fold_assignment"] = fold_assignment
@property
def fold_column(self):
"""
Column with cross-validation fold index assignment per observation.
Type: ``str``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> fold_numbers = cars.kfold_column(n_folds=5,
... seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=cars,
... fold_column="fold_numbers")
>>> cars_gbm.auc(xval=True)
"""
return self._parms.get("fold_column")
@fold_column.setter
def fold_column(self, fold_column):
assert_is_type(fold_column, None, str)
self._parms["fold_column"] = fold_column
@property
def response_column(self):
"""
Response variable column.
Type: ``str``.
"""
return self._parms.get("response_column")
@response_column.setter
def response_column(self, response_column):
assert_is_type(response_column, None, str)
self._parms["response_column"] = response_column
@property
def ignored_columns(self):
"""
Names of columns to ignore for training.
Type: ``List[str]``.
"""
return self._parms.get("ignored_columns")
@ignored_columns.setter
def ignored_columns(self, ignored_columns):
assert_is_type(ignored_columns, None, [str])
self._parms["ignored_columns"] = ignored_columns
@property
def ignore_const_cols(self):
"""
Ignore constant columns.
Type: ``bool`` (default: ``True``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234,
... ignore_const_cols=True)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("ignore_const_cols")
@ignore_const_cols.setter
def ignore_const_cols(self, ignore_const_cols):
assert_is_type(ignore_const_cols, None, bool)
self._parms["ignore_const_cols"] = ignore_const_cols
@property
def offset_column(self):
"""
Offset column. This will be added to the combination of columns before applying the link function.
Type: ``str``.
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston["offset"] = boston["medv"].log()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_gbm = H2OGradientBoostingEstimator(offset_column="offset",
... seed=1234)
>>> boston_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> boston_gbm.mse(valid=True)
"""
return self._parms.get("offset_column")
@offset_column.setter
def offset_column(self, offset_column):
assert_is_type(offset_column, None, str)
self._parms["offset_column"] = offset_column
@property
def weights_column(self):
"""
Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
During training, rows with higher weights matter more, due to the larger loss function pre-factor.
Type: ``str``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid,
... weights_column="weight")
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("weights_column")
@weights_column.setter
def weights_column(self, weights_column):
assert_is_type(weights_column, None, str)
self._parms["weights_column"] = weights_column
@property
def balance_classes(self):
"""
Balance training data class counts via over/under-sampling (for imbalanced data).
Type: ``bool`` (default: ``False``).
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("balance_classes")
@balance_classes.setter
def balance_classes(self, balance_classes):
assert_is_type(balance_classes, None, bool)
self._parms["balance_classes"] = balance_classes
@property
def class_sampling_factors(self):
"""
Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
be automatically computed to obtain class balance during training. Requires balance_classes.
Type: ``List[float]``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
... class_sampling_factors=sample_factors,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("class_sampling_factors")
@class_sampling_factors.setter
def class_sampling_factors(self, class_sampling_factors):
assert_is_type(class_sampling_factors, None, [float])
self._parms["class_sampling_factors"] = class_sampling_factors
@property
def max_after_balance_size(self):
"""
Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
balance_classes.
Type: ``float`` (default: ``5``).
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> max = .85
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
... max_after_balance_size=max,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("max_after_balance_size")
@max_after_balance_size.setter
def max_after_balance_size(self, max_after_balance_size):
assert_is_type(max_after_balance_size, None, float)
self._parms["max_after_balance_size"] = max_after_balance_size
@property
def max_confusion_matrix_size(self):
"""
[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs
Type: ``int`` (default: ``20``).
"""
return self._parms.get("max_confusion_matrix_size")
@max_confusion_matrix_size.setter
def max_confusion_matrix_size(self, max_confusion_matrix_size):
assert_is_type(max_confusion_matrix_size, None, int)
self._parms["max_confusion_matrix_size"] = max_confusion_matrix_size
@property
def ntrees(self):
"""
Number of trees.
Type: ``int`` (default: ``50``).
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> tree_num = [20, 50, 80, 110, 140, 170, 200]
>>> label = ["20", "50", "80", "110", "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
... titanic_gbm = H2OGradientBoostingEstimator(ntrees=num,
... seed=1234)
... titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', titanic_gbm.auc(train=True))
... print(label[key], 'validation score', titanic_gbm.auc(valid=True))
"""
return self._parms.get("ntrees")
@ntrees.setter
def ntrees(self, ntrees):
assert_is_type(ntrees, None, int)
self._parms["ntrees"] = ntrees
@property
def max_depth(self):
"""
Maximum tree depth (0 for unlimited).
Type: ``int`` (default: ``5``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=100,
... max_depth=2,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("max_depth")
@max_depth.setter
def max_depth(self, max_depth):
assert_is_type(max_depth, None, int)
self._parms["max_depth"] = max_depth
@property
def min_rows(self):
"""
Fewest allowed (weighted) observations in a leaf.
Type: ``float`` (default: ``10``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(min_rows=16,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("min_rows")
@min_rows.setter
def min_rows(self, min_rows):
assert_is_type(min_rows, None, numeric)
self._parms["min_rows"] = min_rows
@property
def nbins(self):
"""
For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point
Type: ``int`` (default: ``20``).
:examples:
>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [16, 32, 64, 128, 256, 512]
>>> label = ["16", "32", "64", "128", "256", "512"]
>>> for key, num in enumerate(bin_num):
... eeg_gbm = H2OGradientBoostingEstimator(nbins=num, seed=1234)
... eeg_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', eeg_gbm.auc(train=True))
... print(label[key], 'validation score', eeg_gbm.auc(valid=True))
"""
return self._parms.get("nbins")
@nbins.setter
def nbins(self, nbins):
assert_is_type(nbins, None, int)
self._parms["nbins"] = nbins
@property
def nbins_top_level(self):
"""
For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease
by factor of two per level
Type: ``int`` (default: ``1024``).
:examples:
>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [32, 64, 128, 256, 512, 1024, 2048, 4096]
>>> label = ["32", "64", "128", "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
... eeg_gbm = H2OGradientBoostingEstimator(nbins_top_level=num, seed=1234)
... eeg_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', eeg_gbm.auc(train=True))
... print(label[key], 'validation score', eeg_gbm.auc(valid=True))
"""
return self._parms.get("nbins_top_level")
@nbins_top_level.setter
def nbins_top_level(self, nbins_top_level):
assert_is_type(nbins_top_level, None, int)
self._parms["nbins_top_level"] = nbins_top_level
@property
def nbins_cats(self):
"""
For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher
values can lead to more overfitting.
Type: ``int`` (default: ``1024``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
>>> label = ["8", "16", "32", "64", "128", "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
... airlines_gbm = H2OGradientBoostingEstimator(nbins_cats=num, seed=1234)
... airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
... print(label[key], 'training score', airlines_gbm.auc(train=True))
... print(label[key], 'validation score', airlines_gbm.auc(valid=True))
"""
return self._parms.get("nbins_cats")
@nbins_cats.setter
def nbins_cats(self, nbins_cats):
assert_is_type(nbins_cats, None, int)
self._parms["nbins_cats"] = nbins_cats
@property
def r2_stopping(self):
"""
r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and
stopping_tolerance instead. Previous version of H2O would stop making trees when the R^2 metric equals or
exceeds this
Type: ``float`` (default: ``1.797693135e+308``).
"""
return self._parms.get("r2_stopping")
@r2_stopping.setter
def r2_stopping(self, r2_stopping):
assert_is_type(r2_stopping, None, numeric)
self._parms["r2_stopping"] = r2_stopping
@property
def stopping_rounds(self):
"""
Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
Type: ``int`` (default: ``0``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("stopping_rounds")
@stopping_rounds.setter
def stopping_rounds(self, stopping_rounds):
assert_is_type(stopping_rounds, None, int)
self._parms["stopping_rounds"] = stopping_rounds
@property
def stopping_metric(self):
"""
Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anonomaly_score
for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
client.
One of: ``"auto"``, ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``,
``"aucpr"``, ``"lift_top_group"``, ``"misclassification"``, ``"mean_per_class_error"``, ``"custom"``,
``"custom_increasing"`` (default: ``"auto"``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("stopping_metric")
@stopping_metric.setter
def stopping_metric(self, stopping_metric):
assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"))
self._parms["stopping_metric"] = stopping_metric
@property
def stopping_tolerance(self):
"""
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)
Type: ``float`` (default: ``0.001``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("stopping_tolerance")
@stopping_tolerance.setter
def stopping_tolerance(self, stopping_tolerance):
assert_is_type(stopping_tolerance, None, numeric)
self._parms["stopping_tolerance"] = stopping_tolerance
@property
def max_runtime_secs(self):
"""
Maximum allowed runtime in seconds for model training. Use 0 to disable.
Type: ``float`` (default: ``0``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(max_runtime_secs=10,
... ntrees=10000,
... max_depth=10,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("max_runtime_secs")
@max_runtime_secs.setter
def max_runtime_secs(self, max_runtime_secs):
assert_is_type(max_runtime_secs, None, numeric)
self._parms["max_runtime_secs"] = max_runtime_secs
@property
def seed(self):
"""
Seed for pseudo random number generator (if applicable)
Type: ``int`` (default: ``-1``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> gbm_w_seed_1 = H2OGradientBoostingEstimator(col_sample_rate=.7,
... seed=1234)
>>> gbm_w_seed_1.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print('auc for the 1st model built with a seed:', gbm_w_seed_1.auc(valid=True))
"""
return self._parms.get("seed")
@seed.setter
def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed
@property
def build_tree_one_node(self):
"""
Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.
Type: ``bool`` (default: ``False``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(build_tree_one_node=True,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("build_tree_one_node")
@build_tree_one_node.setter
def build_tree_one_node(self, build_tree_one_node):
assert_is_type(build_tree_one_node, None, bool)
self._parms["build_tree_one_node"] = build_tree_one_node
@property
def learn_rate(self):
"""
Learning rate (from 0.0 to 1.0)
Type: ``float`` (default: ``0.1``).
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
... learn_rate=0.01,
... stopping_rounds=5,
... stopping_metric="AUC",
... stopping_tolerance=1e-4,
... seed=1234)
>>> titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
"""
return self._parms.get("learn_rate")
@learn_rate.setter
def learn_rate(self, learn_rate):
assert_is_type(learn_rate, None, numeric)
self._parms["learn_rate"] = learn_rate
@property
def learn_rate_annealing(self):
"""
Scale the learning rate by this factor after each tree (e.g., 0.99 or 0.999)
Type: ``float`` (default: ``1``).
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
... learn_rate=0.05,
... learn_rate_annealing=.9,
... stopping_rounds=5,
... stopping_metric="AUC",
... stopping_tolerance=1e-4,
... seed=1234)
>>> titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
"""
return self._parms.get("learn_rate_annealing")
@learn_rate_annealing.setter
def learn_rate_annealing(self, learn_rate_annealing):
assert_is_type(learn_rate_annealing, None, numeric)
self._parms["learn_rate_annealing"] = learn_rate_annealing
@property
def distribution(self):
"""
Distribution function
One of: ``"auto"``, ``"bernoulli"``, ``"quasibinomial"``, ``"multinomial"``, ``"gaussian"``, ``"poisson"``,
``"gamma"``, ``"tweedie"``, ``"laplace"``, ``"quantile"``, ``"huber"``, ``"custom"`` (default: ``"auto"``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(distribution="poisson",
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.mse(valid=True)
"""
return self._parms.get("distribution")
@distribution.setter
def distribution(self, distribution):
assert_is_type(distribution, None, Enum("auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom"))
self._parms["distribution"] = distribution
@property
def quantile_alpha(self):
"""
Desired quantile for Quantile regression, must be between 0 and 1.
Type: ``float`` (default: ``0.5``).
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_gbm = H2OGradientBoostingEstimator(distribution="quantile",
... quantile_alpha=.8,
... seed=1234)
>>> boston_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> boston_gbm.mse(valid=True)
"""
return self._parms.get("quantile_alpha")
@quantile_alpha.setter
def quantile_alpha(self, quantile_alpha):
assert_is_type(quantile_alpha, None, numeric)
self._parms["quantile_alpha"] = quantile_alpha
@property
def tweedie_power(self):
"""
Tweedie power for Tweedie regression, must be between 1 and 2.
Type: ``float`` (default: ``1.5``).
:examples:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_gbm = H2OGradientBoostingEstimator(distribution="tweedie",
... tweedie_power=1.2,
... seed=1234)
>>> insurance_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> insurance_gbm.mse(valid=True)
"""
return self._parms.get("tweedie_power")
@tweedie_power.setter
def tweedie_power(self, tweedie_power):
assert_is_type(tweedie_power, None, numeric)
self._parms["tweedie_power"] = tweedie_power
@property
def huber_alpha(self):
"""
Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and 1).
Type: ``float`` (default: ``0.9``).
:examples:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_gbm = H2OGradientBoostingEstimator(distribution="huber",
... huber_alpha=0.9,
... seed=1234)
>>> insurance_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> insurance_gbm.mse(valid=True)
"""
return self._parms.get("huber_alpha")
@huber_alpha.setter
def huber_alpha(self, huber_alpha):
assert_is_type(huber_alpha, None, numeric)
self._parms["huber_alpha"] = huber_alpha
@property
def checkpoint(self):
"""
Model checkpoint to resume training with.
Type: ``str``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=1,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(cars_gbm.auc(valid=True))
>>> print("Number of trees built for cars_gbm model:", cars_gbm.ntrees)
>>> cars_gbm_continued = H2OGradientBoostingEstimator(checkpoint=cars_gbm.model_id,
... ntrees=50,
... seed=1234)
>>> cars_gbm_continued.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm_continued.auc(valid=True)
>>> print("Number of trees built for cars_gbm model:",cars_gbm_continued.ntrees)
"""
return self._parms.get("checkpoint")
@checkpoint.setter
def checkpoint(self, checkpoint):
assert_is_type(checkpoint, None, str, H2OEstimator)
self._parms["checkpoint"] = checkpoint
@property
def sample_rate(self):
"""
Row sample rate per tree (from 0.0 to 1.0)
Type: ``float`` (default: ``1``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(sample_rate=.7,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("sample_rate")
@sample_rate.setter
def sample_rate(self, sample_rate):
assert_is_type(sample_rate, None, numeric)
self._parms["sample_rate"] = sample_rate
@property
def sample_rate_per_class(self):
"""
A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree
Type: ``List[float]``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1]
>>> cov_gbm = H2OGradientBoostingEstimator(sample_rate_per_class=rate_per_class_list,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("sample_rate_per_class")
@sample_rate_per_class.setter
def sample_rate_per_class(self, sample_rate_per_class):
assert_is_type(sample_rate_per_class, None, [numeric])
self._parms["sample_rate_per_class"] = sample_rate_per_class
@property
def col_sample_rate(self):
"""
Column sample rate (from 0.0 to 1.0)
Type: ``float`` (default: ``1``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate=.7,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("col_sample_rate")
@col_sample_rate.setter
def col_sample_rate(self, col_sample_rate):
assert_is_type(col_sample_rate, None, numeric)
self._parms["col_sample_rate"] = col_sample_rate
@property
def col_sample_rate_change_per_level(self):
"""
Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0)
Type: ``float`` (default: ``1``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_change_per_level=.9,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("col_sample_rate_change_per_level")
@col_sample_rate_change_per_level.setter
def col_sample_rate_change_per_level(self, col_sample_rate_change_per_level):
assert_is_type(col_sample_rate_change_per_level, None, numeric)
self._parms["col_sample_rate_change_per_level"] = col_sample_rate_change_per_level
@property
def col_sample_rate_per_tree(self):
"""
Column sample rate per tree (from 0.0 to 1.0)
Type: ``float`` (default: ``1``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_per_tree=.7,
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("col_sample_rate_per_tree")
@col_sample_rate_per_tree.setter
def col_sample_rate_per_tree(self, col_sample_rate_per_tree):
assert_is_type(col_sample_rate_per_tree, None, numeric)
self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree
@property
def min_split_improvement(self):
"""
Minimum relative improvement in squared error reduction for a split to happen
Type: ``float`` (default: ``1e-05``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(min_split_improvement=1e-3,
... seed=1234)
>>> cars_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_gbm.auc(valid=True)
"""
return self._parms.get("min_split_improvement")
@min_split_improvement.setter
def min_split_improvement(self, min_split_improvement):
assert_is_type(min_split_improvement, None, numeric)
self._parms["min_split_improvement"] = min_split_improvement
@property
def histogram_type(self):
"""
What type of histogram to use for finding optimal split points
One of: ``"auto"``, ``"uniform_adaptive"``, ``"random"``, ``"quantiles_global"``, ``"round_robin"`` (default:
``"auto"``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(histogram_type="UniformAdaptive",
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("histogram_type")
@histogram_type.setter
def histogram_type(self, histogram_type):
assert_is_type(histogram_type, None, Enum("auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"))
self._parms["histogram_type"] = histogram_type
@property
def max_abs_leafnode_pred(self):
"""
Maximum absolute value of a leaf node prediction
Type: ``float`` (default: ``1.797693135e+308``).
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(max_abs_leafnode_pred=2,
... seed=1234)
>>> cov_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
"""
return self._parms.get("max_abs_leafnode_pred")
@max_abs_leafnode_pred.setter
def max_abs_leafnode_pred(self, max_abs_leafnode_pred):
assert_is_type(max_abs_leafnode_pred, None, numeric)
self._parms["max_abs_leafnode_pred"] = max_abs_leafnode_pred
@property
def pred_noise_bandwidth(self):
"""
Bandwidth (sigma) of Gaussian multiplicative noise ~N(1,sigma) for tree node predictions
Type: ``float`` (default: ``0``).
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(pred_noise_bandwidth=0.1,
... seed=1234)
>>> titanic_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_gbm.auc(valid = True)
"""
return self._parms.get("pred_noise_bandwidth")
@pred_noise_bandwidth.setter
def pred_noise_bandwidth(self, pred_noise_bandwidth):
assert_is_type(pred_noise_bandwidth, None, numeric)
self._parms["pred_noise_bandwidth"] = pred_noise_bandwidth
@property
def categorical_encoding(self):
"""
Encoding scheme for categorical features
One of: ``"auto"``, ``"enum"``, ``"one_hot_internal"``, ``"one_hot_explicit"``, ``"binary"``, ``"eigen"``,
``"label_encoder"``, ``"sort_by_response"``, ``"enum_limited"`` (default: ``"auto"``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(categorical_encoding="labelencoder",
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
"""
return self._parms.get("categorical_encoding")
@categorical_encoding.setter
def categorical_encoding(self, categorical_encoding):
assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
self._parms["categorical_encoding"] = categorical_encoding
@property
def calibrate_model(self):
"""
Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates
of class probabilities.
Type: ``bool`` (default: ``False``).
:examples:
>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> train, calib = ecology.split_frame(seed = 12354)
>>> predictors = ecology.columns[3:13]
>>> w = h2o.create_frame(binary_fraction=1,
... binary_ones_fraction=0.5,
... missing_fraction=0,
... rows=744, cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
... max_depth=5,
... min_rows=10,
... learn_rate=0.1,
... distribution="multinomial",
... weights_column="weight",
... calibrate_model=True,
... calibration_frame=calib)
>>> ecology_gbm.train(x=predictors,
... y="Angaus",
... training_frame=train)
>>> ecology_gbm.auc()
"""
return self._parms.get("calibrate_model")
@calibrate_model.setter
def calibrate_model(self, calibrate_model):
assert_is_type(calibrate_model, None, bool)
self._parms["calibrate_model"] = calibrate_model
@property
def calibration_frame(self):
"""
Calibration frame for Platt Scaling
Type: ``H2OFrame``.
:examples:
>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> predictors = ecology.columns[3:13]
>>> train, calib = ecology.split_frame(seed=12354)
>>> w = h2o.create_frame(binary_fraction=1,
... binary_ones_fraction=0.5,
... missing_fraction=0,
... rows=744,cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
... max_depth=5,
... min_rows=10,
... learn_rate=0.1,
... distribution="multinomial",
... calibrate_model=True,
... calibration_frame=calib)
>>> ecology_gbm.train(x=predictors,
... y="Angaus",
... training_frame=train,
... weights_column="weight")
>>> ecology_gbm.auc()
"""
return self._parms.get("calibration_frame")
@calibration_frame.setter
def calibration_frame(self, calibration_frame):
self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame')
@property
def custom_metric_func(self):
"""
Reference to custom evaluation function, format: `language:keyName=funcName`
Type: ``str``.
"""
return self._parms.get("custom_metric_func")
@custom_metric_func.setter
def custom_metric_func(self, custom_metric_func):
assert_is_type(custom_metric_func, None, str)
self._parms["custom_metric_func"] = custom_metric_func
@property
def custom_distribution_func(self):
"""
Reference to custom distribution, format: `language:keyName=funcName`
Type: ``str``.
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(ntrees=3,
... max_depth=5,
... distribution="bernoulli",
... seed=1234)
>>> airlines_gbm.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame valid)
>>> from h2o.utils.distributions import CustomDistributionBernoulli
>>> custom_distribution_bernoulli = h2o.upload_custom_distribution(CustomDistributionBernoulli,
... func_name="custom_bernoulli",
... func_file="custom_bernoulli.py")
>>> airlines_gbm_custom = H2OGradientBoostingEstimator(ntrees=3,
... max_depth=5,
... distribution="custom",
... custom_distribution_func=custom_distribution_bernoulli,
... seed=1235)
>>> airlines_gbm_custom.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_gbm.auc()
"""
return self._parms.get("custom_distribution_func")
@custom_distribution_func.setter
def custom_distribution_func(self, custom_distribution_func):
assert_is_type(custom_distribution_func, None, str)
self._parms["custom_distribution_func"] = custom_distribution_func
@property
def export_checkpoints_dir(self):
"""
Automatically export generated models to this directory.
Type: ``str``.
:examples:
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> hyper_parameters = {'ntrees': [5,10]}
>>> search_crit = {'strategy': "RandomDiscrete",
... 'max_models': 5,
... 'seed': 1234,
... 'stopping_rounds': 3,
... 'stopping_metric': "AUTO",
... 'stopping_tolerance': 1e-2}
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
... hyper_params=hyper_parameters,
... search_criteria=search_crit)
>>> air_grid.train(x=predictors,
... y=response,
... training_frame=airlines,
... distribution="bernoulli",
... learn_rate=0.1,
... max_depth=3,
... export_checkpoints_dir=checkpoints_dir)
>>> len(listdir(checkpoints_dir))
"""
return self._parms.get("export_checkpoints_dir")
@export_checkpoints_dir.setter
def export_checkpoints_dir(self, export_checkpoints_dir):
assert_is_type(export_checkpoints_dir, None, str)
self._parms["export_checkpoints_dir"] = export_checkpoints_dir
@property
def monotone_constraints(self):
"""
A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a
decreasing constraint.
Type: ``dict``.
:examples:
>>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor()
>>> response = "CAPSULE"
>>> seed = 42
>>> monotone_constraints = {"AGE":1}
>>> gbm_model = H2OGradientBoostingEstimator(seed=seed,
... monotone_constraints=monotone_constraints)
>>> gbm_model.train(y=response,
... ignored_columns=["ID"],
... training_frame=prostate_hex)
>>> gbm_model.scoring_history()
"""
return self._parms.get("monotone_constraints")
@monotone_constraints.setter
def monotone_constraints(self, monotone_constraints):
assert_is_type(monotone_constraints, None, dict)
self._parms["monotone_constraints"] = monotone_constraints
@property
def check_constant_response(self):
"""
Check if response column is constant. If enabled, then an exception is thrown if the response column is a
constant value.If disabled, then model will train regardless of the response column being a constant value or
not.
Type: ``bool`` (default: ``True``).
:examples:
>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
>>> train["constantCol"] = 1
>>> my_gbm = H2OGradientBoostingEstimator(check_constant_response=False)
>>> my_gbm.train(x=list(range(1,5)),
... y="constantCol",
... training_frame=train)
"""
return self._parms.get("check_constant_response")
@check_constant_response.setter
def check_constant_response(self, check_constant_response):
assert_is_type(check_constant_response, None, bool)
self._parms["check_constant_response"] = check_constant_response
@property
def gainslift_bins(self):
"""
Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning.
Type: ``int`` (default: ``-1``).
:examples:
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
... y="IsDepDelayed",
... training_frame=airlines)
>>> model.gains_lift()
"""
return self._parms.get("gainslift_bins")
@gainslift_bins.setter
def gainslift_bins(self, gainslift_bins):
assert_is_type(gainslift_bins, None, int)
self._parms["gainslift_bins"] = gainslift_bins
@property
def auc_type(self):
"""
Set default multinomial AUC type.
One of: ``"auto"``, ``"none"``, ``"macro_ovr"``, ``"weighted_ovr"``, ``"macro_ovo"``, ``"weighted_ovo"``
(default: ``"auto"``).
"""
return self._parms.get("auc_type")
@auc_type.setter
def auc_type(self, auc_type):
assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
self._parms["auc_type"] = auc_type