#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
import h2o
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
class H2OXGBoostEstimator(H2OEstimator):
"""
XGBoost
Builds an eXtreme Gradient Boosting model using the native XGBoost backend.
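:examples:
A minimal end-to-end sketch (assumes a running H2O cluster via ``h2o.init()``; the dataset and parameter values are illustrative, not recommendations):
>>> import h2o
>>> from h2o.estimators import H2OXGBoostEstimator
>>> h2o.init()
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(ntrees=50,
...                                   max_depth=6,
...                                   seed=1234)
>>> titanic_xgb.train(y='survived',
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_xgb.auc(valid=True)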
"""
algo = "xgboost"
supervised_learning = True
_options_ = {'model_extensions': ['h2o.model.extensions.ScoringHistoryTrees',
'h2o.model.extensions.VariableImportance',
'h2o.model.extensions.FeatureInteraction',
'h2o.model.extensions.Trees',
'h2o.model.extensions.SupervisedTrees',
'h2o.model.extensions.HStatistic',
'h2o.model.extensions.Contributions',
'h2o.model.extensions.Fairness'],
'verbose': True}
def __init__(self,
model_id=None, # type: Optional[Union[None, str, H2OEstimator]]
training_frame=None, # type: Optional[Union[None, str, H2OFrame]]
validation_frame=None, # type: Optional[Union[None, str, H2OFrame]]
nfolds=0, # type: int
keep_cross_validation_models=True, # type: bool
keep_cross_validation_predictions=False, # type: bool
keep_cross_validation_fold_assignment=False, # type: bool
score_each_iteration=False, # type: bool
fold_assignment="auto", # type: Literal["auto", "random", "modulo", "stratified"]
fold_column=None, # type: Optional[str]
response_column=None, # type: Optional[str]
ignored_columns=None, # type: Optional[List[str]]
ignore_const_cols=True, # type: bool
offset_column=None, # type: Optional[str]
weights_column=None, # type: Optional[str]
stopping_rounds=0, # type: int
stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
stopping_tolerance=0.001, # type: float
max_runtime_secs=0.0, # type: float
seed=-1, # type: int
distribution="auto", # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
tweedie_power=1.5, # type: float
categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
quiet_mode=True, # type: bool
checkpoint=None, # type: Optional[Union[None, str, H2OEstimator]]
export_checkpoints_dir=None, # type: Optional[str]
custom_metric_func=None, # type: Optional[str]
ntrees=50, # type: int
max_depth=6, # type: int
min_rows=1.0, # type: float
min_child_weight=1.0, # type: float
learn_rate=0.3, # type: float
eta=0.3, # type: float
sample_rate=1.0, # type: float
subsample=1.0, # type: float
col_sample_rate=1.0, # type: float
colsample_bylevel=1.0, # type: float
col_sample_rate_per_tree=1.0, # type: float
colsample_bytree=1.0, # type: float
colsample_bynode=1.0, # type: float
max_abs_leafnode_pred=0.0, # type: float
max_delta_step=0.0, # type: float
monotone_constraints=None, # type: Optional[dict]
interaction_constraints=None, # type: Optional[List[List[str]]]
score_tree_interval=0, # type: int
min_split_improvement=0.0, # type: float
gamma=0.0, # type: float
nthread=-1, # type: int
save_matrix_directory=None, # type: Optional[str]
build_tree_one_node=False, # type: bool
parallelize_cross_validation=True, # type: bool
calibrate_model=False, # type: bool
calibration_frame=None, # type: Optional[Union[None, str, H2OFrame]]
calibration_method="auto", # type: Literal["auto", "platt_scaling", "isotonic_regression"]
max_bins=256, # type: int
max_leaves=0, # type: int
sample_type="uniform", # type: Literal["uniform", "weighted"]
normalize_type="tree", # type: Literal["tree", "forest"]
rate_drop=0.0, # type: float
one_drop=False, # type: bool
skip_drop=0.0, # type: float
tree_method="auto", # type: Literal["auto", "exact", "approx", "hist"]
grow_policy="depthwise", # type: Literal["depthwise", "lossguide"]
booster="gbtree", # type: Literal["gbtree", "gblinear", "dart"]
reg_lambda=1.0, # type: float
reg_alpha=0.0, # type: float
dmatrix_type="auto", # type: Literal["auto", "dense", "sparse"]
backend="auto", # type: Literal["auto", "gpu", "cpu"]
gpu_id=None, # type: Optional[List[int]]
gainslift_bins=-1, # type: int
auc_type="auto", # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
scale_pos_weight=1.0, # type: float
eval_metric=None, # type: Optional[str]
score_eval_metric_only=False, # type: bool
):
"""
:param model_id: Destination id for this model; auto-generated if not specified.
Defaults to ``None``.
:type model_id: Union[None, str, H2OEstimator], optional
:param training_frame: Id of the training data frame.
Defaults to ``None``.
:type training_frame: Union[None, str, H2OFrame], optional
:param validation_frame: Id of the validation data frame.
Defaults to ``None``.
:type validation_frame: Union[None, str, H2OFrame], optional
:param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2).
Defaults to ``0``.
:type nfolds: int
:param keep_cross_validation_models: Whether to keep the cross-validation models.
Defaults to ``True``.
:type keep_cross_validation_models: bool
:param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
Defaults to ``False``.
:type keep_cross_validation_predictions: bool
:param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment.
Defaults to ``False``.
:type keep_cross_validation_fold_assignment: bool
:param score_each_iteration: Whether to score during each iteration of model training.
Defaults to ``False``.
:type score_each_iteration: bool
:param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The
'Stratified' option will stratify the folds based on the response variable, for classification problems.
Defaults to ``"auto"``.
:type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
:param fold_column: Column with cross-validation fold index assignment per observation.
Defaults to ``None``.
:type fold_column: str, optional
:param response_column: Response variable column.
Defaults to ``None``.
:type response_column: str, optional
:param ignored_columns: Names of columns to ignore for training.
Defaults to ``None``.
:type ignored_columns: List[str], optional
:param ignore_const_cols: Ignore constant columns.
Defaults to ``True``.
:type ignore_const_cols: bool
:param offset_column: Offset column. This will be added to the combination of columns before applying the link
function.
Defaults to ``None``.
:type offset_column: str, optional
:param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
not increase the size of the data frame. This is typically the number of times a row is repeated, but
non-integer values are supported as well. During training, rows with higher weights matter more, due to
the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
that row is zero, which is incorrect; to get an accurate prediction, remove all rows with weight == 0.
Defaults to ``None``.
:type weights_column: str, optional
:param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average of
length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
Defaults to ``0``.
:type stopping_rounds: int
:param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for
regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be
used in GBM and DRF with the Python client.
Defaults to ``"auto"``.
:type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
"misclassification", "mean_per_class_error", "custom", "custom_increasing"]
:param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative improvement
is not at least this much)
Defaults to ``0.001``.
:type stopping_tolerance: float
:param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
Defaults to ``0.0``.
:type max_runtime_secs: float
:param seed: Seed for pseudo random number generator (if applicable)
Defaults to ``-1``.
:type seed: int
:param distribution: Distribution function
Defaults to ``"auto"``.
:type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
"quantile", "huber"]
:param tweedie_power: Tweedie power for Tweedie regression, must be between 1 and 2.
Defaults to ``1.5``.
:type tweedie_power: float
:param categorical_encoding: Encoding scheme for categorical features
Defaults to ``"auto"``.
:type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
"sort_by_response", "enum_limited"]
:param quiet_mode: Enable quiet mode
Defaults to ``True``.
:type quiet_mode: bool
:param checkpoint: Model checkpoint to resume training with.
Defaults to ``None``.
:type checkpoint: Union[None, str, H2OEstimator], optional
:param export_checkpoints_dir: Automatically export generated models to this directory.
Defaults to ``None``.
:type export_checkpoints_dir: str, optional
:param custom_metric_func: Reference to custom evaluation function, format: ``language:keyName=funcName``
Defaults to ``None``.
:type custom_metric_func: str, optional
:param ntrees: (same as n_estimators) Number of trees.
Defaults to ``50``.
:type ntrees: int
:param max_depth: Maximum tree depth (0 for unlimited).
Defaults to ``6``.
:type max_depth: int
:param min_rows: (same as min_child_weight) Fewest allowed (weighted) observations in a leaf.
Defaults to ``1.0``.
:type min_rows: float
:param min_child_weight: (same as min_rows) Fewest allowed (weighted) observations in a leaf.
Defaults to ``1.0``.
:type min_child_weight: float
:param learn_rate: (same as eta) Learning rate (from 0.0 to 1.0)
Defaults to ``0.3``.
:type learn_rate: float
:param eta: (same as learn_rate) Learning rate (from 0.0 to 1.0)
Defaults to ``0.3``.
:type eta: float
:param sample_rate: (same as subsample) Row sample rate per tree (from 0.0 to 1.0)
Defaults to ``1.0``.
:type sample_rate: float
:param subsample: (same as sample_rate) Row sample rate per tree (from 0.0 to 1.0)
Defaults to ``1.0``.
:type subsample: float
:param col_sample_rate: (same as colsample_bylevel) Column sample rate (from 0.0 to 1.0)
Defaults to ``1.0``.
:type col_sample_rate: float
:param colsample_bylevel: (same as col_sample_rate) Column sample rate (from 0.0 to 1.0)
Defaults to ``1.0``.
:type colsample_bylevel: float
:param col_sample_rate_per_tree: (same as colsample_bytree) Column sample rate per tree (from 0.0 to 1.0)
Defaults to ``1.0``.
:type col_sample_rate_per_tree: float
:param colsample_bytree: (same as col_sample_rate_per_tree) Column sample rate per tree (from 0.0 to 1.0)
Defaults to ``1.0``.
:type colsample_bytree: float
:param colsample_bynode: Column sample rate per tree node (from 0.0 to 1.0)
Defaults to ``1.0``.
:type colsample_bynode: float
:param max_abs_leafnode_pred: (same as max_delta_step) Maximum absolute value of a leaf node prediction
Defaults to ``0.0``.
:type max_abs_leafnode_pred: float
:param max_delta_step: (same as max_abs_leafnode_pred) Maximum absolute value of a leaf node prediction
Defaults to ``0.0``.
:type max_delta_step: float
:param monotone_constraints: A mapping representing monotonic constraints. Use +1 to enforce an increasing
constraint and -1 to specify a decreasing constraint.
Defaults to ``None``.
:type monotone_constraints: dict, optional
:param interaction_constraints: A set of allowed column interactions.
Defaults to ``None``.
:type interaction_constraints: List[List[str]], optional
:param score_tree_interval: Score the model after every so many trees. Disabled if set to 0.
Defaults to ``0``.
:type score_tree_interval: int
:param min_split_improvement: (same as gamma) Minimum relative improvement in squared error reduction for a
split to happen
Defaults to ``0.0``.
:type min_split_improvement: float
:param gamma: (same as min_split_improvement) Minimum relative improvement in squared error reduction for a
split to happen
Defaults to ``0.0``.
:type gamma: float
:param nthread: Number of parallel threads that can be used to run XGBoost. Cannot exceed H2O cluster limits
(-nthreads parameter). Defaults to maximum available
Defaults to ``-1``.
:type nthread: int
:param save_matrix_directory: Directory where to save matrices passed to XGBoost library. Useful for debugging.
Defaults to ``None``.
:type save_matrix_directory: str, optional
:param build_tree_one_node: Run on one node only; no network overhead but fewer cpus used. Suitable for small
datasets.
Defaults to ``False``.
:type build_tree_one_node: bool
:param parallelize_cross_validation: Allow parallel training of cross-validation models
Defaults to ``True``.
:type parallelize_cross_validation: bool
:param calibrate_model: Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class
probabilities. Calibration can provide more accurate estimates of class probabilities.
Defaults to ``False``.
:type calibrate_model: bool
:param calibration_frame: Data for model calibration
Defaults to ``None``.
:type calibration_frame: Union[None, str, H2OFrame], optional
:param calibration_method: Calibration method to use
Defaults to ``"auto"``.
:type calibration_method: Literal["auto", "platt_scaling", "isotonic_regression"]
:param max_bins: For tree_method=hist only: maximum number of bins
Defaults to ``256``.
:type max_bins: int
:param max_leaves: For tree_method=hist only: maximum number of leaves
Defaults to ``0``.
:type max_leaves: int
:param sample_type: For booster=dart only: sample_type
Defaults to ``"uniform"``.
:type sample_type: Literal["uniform", "weighted"]
:param normalize_type: For booster=dart only: normalize_type
Defaults to ``"tree"``.
:type normalize_type: Literal["tree", "forest"]
:param rate_drop: For booster=dart only: rate_drop (0..1)
Defaults to ``0.0``.
:type rate_drop: float
:param one_drop: For booster=dart only: one_drop
Defaults to ``False``.
:type one_drop: bool
:param skip_drop: For booster=dart only: skip_drop (0..1)
Defaults to ``0.0``.
:type skip_drop: float
:param tree_method: Tree method
Defaults to ``"auto"``.
:type tree_method: Literal["auto", "exact", "approx", "hist"]
:param grow_policy: Grow policy - depthwise is standard GBM, lossguide is LightGBM
Defaults to ``"depthwise"``.
:type grow_policy: Literal["depthwise", "lossguide"]
:param booster: Booster type
Defaults to ``"gbtree"``.
:type booster: Literal["gbtree", "gblinear", "dart"]
:param reg_lambda: L2 regularization
Defaults to ``1.0``.
:type reg_lambda: float
:param reg_alpha: L1 regularization
Defaults to ``0.0``.
:type reg_alpha: float
:param dmatrix_type: Type of DMatrix. For sparse, NAs and 0 are treated equally.
Defaults to ``"auto"``.
:type dmatrix_type: Literal["auto", "dense", "sparse"]
:param backend: Backend. By default (auto), a GPU is used if available.
Defaults to ``"auto"``.
:type backend: Literal["auto", "gpu", "cpu"]
:param gpu_id: Which GPU(s) to use.
Defaults to ``None``.
:type gpu_id: List[int], optional
:param gainslift_bins: Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic
binning.
Defaults to ``-1``.
:type gainslift_bins: int
:param auc_type: Set default multinomial AUC type.
Defaults to ``"auto"``.
:type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
:param scale_pos_weight: Controls the effect of observations with positive labels in relation to the
observations with negative labels on gradient calculation. Useful for imbalanced problems.
Defaults to ``1.0``.
:type scale_pos_weight: float
:param eval_metric: Specification of evaluation metric that will be passed to the native XGBoost backend.
Defaults to ``None``.
:type eval_metric: str, optional
:param score_eval_metric_only: If enabled, score only the evaluation metric. This can make model training faster
if scoring is frequent (eg. each iteration).
Defaults to ``False``.
:type score_eval_metric_only: bool
"""
super(H2OXGBoostEstimator, self).__init__()
self._parms = {}
self._id = self._parms['model_id'] = model_id
self.training_frame = training_frame
self.validation_frame = validation_frame
self.nfolds = nfolds
self.keep_cross_validation_models = keep_cross_validation_models
self.keep_cross_validation_predictions = keep_cross_validation_predictions
self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
self.score_each_iteration = score_each_iteration
self.fold_assignment = fold_assignment
self.fold_column = fold_column
self.response_column = response_column
self.ignored_columns = ignored_columns
self.ignore_const_cols = ignore_const_cols
self.offset_column = offset_column
self.weights_column = weights_column
self.stopping_rounds = stopping_rounds
self.stopping_metric = stopping_metric
self.stopping_tolerance = stopping_tolerance
self.max_runtime_secs = max_runtime_secs
self.seed = seed
self.distribution = distribution
self.tweedie_power = tweedie_power
self.categorical_encoding = categorical_encoding
self.quiet_mode = quiet_mode
self.checkpoint = checkpoint
self.export_checkpoints_dir = export_checkpoints_dir
self.custom_metric_func = custom_metric_func
self.ntrees = ntrees
self.max_depth = max_depth
self.min_rows = min_rows
self.min_child_weight = min_child_weight
self.learn_rate = learn_rate
self.eta = eta
self.sample_rate = sample_rate
self.subsample = subsample
self.col_sample_rate = col_sample_rate
self.colsample_bylevel = colsample_bylevel
self.col_sample_rate_per_tree = col_sample_rate_per_tree
self.colsample_bytree = colsample_bytree
self.colsample_bynode = colsample_bynode
self.max_abs_leafnode_pred = max_abs_leafnode_pred
self.max_delta_step = max_delta_step
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
self.score_tree_interval = score_tree_interval
self.min_split_improvement = min_split_improvement
self.gamma = gamma
self.nthread = nthread
self.save_matrix_directory = save_matrix_directory
self.build_tree_one_node = build_tree_one_node
self.parallelize_cross_validation = parallelize_cross_validation
self.calibrate_model = calibrate_model
self.calibration_frame = calibration_frame
self.calibration_method = calibration_method
self.max_bins = max_bins
self.max_leaves = max_leaves
self.sample_type = sample_type
self.normalize_type = normalize_type
self.rate_drop = rate_drop
self.one_drop = one_drop
self.skip_drop = skip_drop
self.tree_method = tree_method
self.grow_policy = grow_policy
self.booster = booster
self.reg_lambda = reg_lambda
self.reg_alpha = reg_alpha
self.dmatrix_type = dmatrix_type
self.backend = backend
self.gpu_id = gpu_id
self.gainslift_bins = gainslift_bins
self.auc_type = auc_type
self.scale_pos_weight = scale_pos_weight
self.eval_metric = eval_metric
self.score_eval_metric_only = score_eval_metric_only
@property
def training_frame(self):
"""
Id of the training data frame.
Type: ``Union[None, str, H2OFrame]``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
"""
return self._parms.get("training_frame")
@training_frame.setter
def training_frame(self, training_frame):
self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def validation_frame(self):
"""
Id of the validation data frame.
Type: ``Union[None, str, H2OFrame]``.
:examples:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> train, valid = insurance.split_frame(ratios=[.8],
... seed=1234)
>>> insurance_xgb = H2OXGBoostEstimator(seed=1234)
>>> insurance_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(insurance_xgb.mse(valid=True))
"""
return self._parms.get("validation_frame")
@validation_frame.setter
def validation_frame(self, validation_frame):
self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def nfolds(self):
"""
Number of folds for K-fold cross-validation (0 to disable or >= 2).
Type: ``int``, defaults to ``0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> folds = 5
>>> titanic_xgb = H2OXGBoostEstimator(nfolds=folds,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=titanic)
>>> titanic_xgb.auc(xval=True)
"""
return self._parms.get("nfolds")
@nfolds.setter
def nfolds(self, nfolds):
assert_is_type(nfolds, None, int)
self._parms["nfolds"] = nfolds
@property
def keep_cross_validation_models(self):
"""
Whether to keep the cross-validation models.
Type: ``bool``, defaults to ``True``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_models=True,
...                                    nfolds=5,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train)
>>> titanic_xgb.cross_validation_models()
"""
return self._parms.get("keep_cross_validation_models")
@keep_cross_validation_models.setter
def keep_cross_validation_models(self, keep_cross_validation_models):
assert_is_type(keep_cross_validation_models, None, bool)
self._parms["keep_cross_validation_models"] = keep_cross_validation_models
@property
def keep_cross_validation_predictions(self):
"""
Whether to keep the predictions of the cross-validation models.
Type: ``bool``, defaults to ``False``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_predictions=True,
... nfolds=5,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train)
>>> titanic_xgb.cross_validation_predictions()
"""
return self._parms.get("keep_cross_validation_predictions")
@keep_cross_validation_predictions.setter
def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
assert_is_type(keep_cross_validation_predictions, None, bool)
self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions
@property
def keep_cross_validation_fold_assignment(self):
"""
Whether to keep the cross-validation fold assignment.
Type: ``bool``, defaults to ``False``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_fold_assignment=True,
... nfolds=5,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train)
>>> titanic_xgb.cross_validation_fold_assignment()
"""
return self._parms.get("keep_cross_validation_fold_assignment")
@keep_cross_validation_fold_assignment.setter
def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
assert_is_type(keep_cross_validation_fold_assignment, None, bool)
self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment
@property
def score_each_iteration(self):
"""
Whether to score during each iteration of model training.
Type: ``bool``, defaults to ``False``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(score_each_iteration=True,
... ntrees=55,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_xgb.scoring_history()
"""
return self._parms.get("score_each_iteration")
@score_each_iteration.setter
def score_each_iteration(self, score_each_iteration):
assert_is_type(score_each_iteration, None, bool)
self._parms["score_each_iteration"] = score_each_iteration
@property
def fold_assignment(self):
"""
Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
the folds based on the response variable, for classification problems.
Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> assignment_type = "Random"
>>> titanic_xgb = H2OXGBoostEstimator(fold_assignment=assignment_type,
... nfolds=5,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=titanic)
>>> titanic_xgb.auc(xval=True)
"""
return self._parms.get("fold_assignment")
@fold_assignment.setter
def fold_assignment(self, fold_assignment):
assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified"))
self._parms["fold_assignment"] = fold_assignment
@property
def fold_column(self):
"""
Column with cross-validation fold index assignment per observation.
Type: ``str``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> fold_numbers = titanic.kfold_column(n_folds=5,
... seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> titanic = titanic.cbind(fold_numbers)
>>> print(titanic['fold_numbers'])
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=titanic,
... fold_column="fold_numbers")
>>> titanic_xgb.auc(xval=True)
"""
return self._parms.get("fold_column")
@fold_column.setter
def fold_column(self, fold_column):
assert_is_type(fold_column, None, str)
self._parms["fold_column"] = fold_column
@property
def response_column(self):
"""
Response variable column.
Type: ``str``.
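:examples:
A brief sketch; in practice this parameter is usually set implicitly by passing ``y`` to ``train()``:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(y='survived',
...                   training_frame=titanic)
>>> titanic_xgb.auc(train=True)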
"""
return self._parms.get("response_column")
@response_column.setter
def response_column(self, response_column):
assert_is_type(response_column, None, str)
self._parms["response_column"] = response_column
@property
def ignored_columns(self):
"""
Names of columns to ignore for training.
Type: ``List[str]``.
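:examples:
A brief sketch; ignoring columns is the complement of selecting predictors with ``x`` (``name`` and ``ticket`` are free-text columns in the titanic dataset):
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> titanic_xgb = H2OXGBoostEstimator(ignored_columns=["name", "ticket"],
...                                   seed=1234)
>>> titanic_xgb.train(y='survived',
...                   training_frame=titanic)
>>> titanic_xgb.auc(train=True)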
"""
return self._parms.get("ignored_columns")
@ignored_columns.setter
def ignored_columns(self, ignored_columns):
assert_is_type(ignored_columns, None, [str])
self._parms["ignored_columns"] = ignored_columns
@property
def ignore_const_cols(self):
"""
Ignore constant columns.
Type: ``bool``, defaults to ``True``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> titanic["const_1"] = 6
>>> titanic["const_2"] = 7
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234,
... ignore_const_cols=True)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
"""
return self._parms.get("ignore_const_cols")
@ignore_const_cols.setter
def ignore_const_cols(self, ignore_const_cols):
assert_is_type(ignore_const_cols, None, bool)
self._parms["ignore_const_cols"] = ignore_const_cols
@property
def offset_column(self):
"""
Offset column. This will be added to the combination of columns before applying the link function.
Type: ``str``.
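:examples:
A hedged sketch on the insurance dataset; using the log of ``Holders`` as an exposure offset for a Poisson model is an assumption made for illustration:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> insurance['offset'] = insurance['Holders'].log()
>>> insurance_xgb = H2OXGBoostEstimator(distribution="poisson",
...                                     seed=1234)
>>> insurance_xgb.train(x=insurance.columns[0:3],
...                     y='Claims',
...                     offset_column='offset',
...                     training_frame=insurance)
>>> print(insurance_xgb.mse(train=True))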
"""
return self._parms.get("offset_column")
@offset_column.setter
def offset_column(self, offset_column):
assert_is_type(offset_column, None, str)
self._parms["offset_column"] = offset_column
@property
def weights_column(self):
"""
Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
weight = 0 for a row, the returned prediction frame at that row is zero, which is incorrect; to get an
accurate prediction, remove all rows with weight == 0.
Type: ``str``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
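A hedged variant that actually exercises the parameter, using a constant weight column added purely for illustration:
>>> train['weight'] = 1
>>> titanic_xgb_w = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb_w.train(x=predictors,
...                     y=response,
...                     training_frame=train,
...                     weights_column='weight')
>>> titanic_xgb_w.auc(train=True)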
"""
return self._parms.get("weights_column")
@weights_column.setter
def weights_column(self, weights_column):
assert_is_type(weights_column, None, str)
self._parms["weights_column"] = weights_column
@property
def stopping_rounds(self):
"""
Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
Type: ``int``, defaults to ``0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
"""
return self._parms.get("stopping_rounds")
@stopping_rounds.setter
def stopping_rounds(self, stopping_rounds):
assert_is_type(stopping_rounds, None, int)
self._parms["stopping_rounds"] = stopping_rounds
@property
def stopping_metric(self):
"""
Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score
for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
client.
Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
"misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
"""
return self._parms.get("stopping_metric")
@stopping_metric.setter
def stopping_metric(self, stopping_metric):
assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"))
self._parms["stopping_metric"] = stopping_metric
@property
def stopping_tolerance(self):
"""
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)
Type: ``float``, defaults to ``0.001``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc",
... stopping_rounds=3,
... stopping_tolerance=1e-2,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
"""
return self._parms.get("stopping_tolerance")
@stopping_tolerance.setter
def stopping_tolerance(self, stopping_tolerance):
assert_is_type(stopping_tolerance, None, numeric)
self._parms["stopping_tolerance"] = stopping_tolerance
@property
def max_runtime_secs(self):
"""
Maximum allowed runtime in seconds for model training. Use 0 to disable.
Type: ``float``, defaults to ``0.0``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
... seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_runtime_secs=10,
... ntrees=10000,
... max_depth=10,
... seed=1234)
>>> cov_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
"""
return self._parms.get("max_runtime_secs")
@max_runtime_secs.setter
def max_runtime_secs(self, max_runtime_secs):
assert_is_type(max_runtime_secs, None, numeric)
self._parms["max_runtime_secs"] = max_runtime_secs
@property
def seed(self):
"""
Seed for pseudo random number generator (if applicable)
Type: ``int``, defaults to ``-1``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> xgb_w_seed_1 = H2OXGBoostEstimator(col_sample_rate=.7,
... seed=1234)
>>> xgb_w_seed_1.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> xgb_w_seed_2 = H2OXGBoostEstimator(col_sample_rate=.7,
...                                    seed=1234)
>>> xgb_w_seed_2.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print('auc for the 1st model built with a seed:',
... xgb_w_seed_1.auc(valid=True))
>>> print('auc for the 2nd model built with a seed:',
... xgb_w_seed_2.auc(valid=True))
"""
return self._parms.get("seed")
@seed.setter
def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed
@property
def distribution(self):
"""
Distribution function
Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
"quantile", "huber"]``, defaults to ``"auto"``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8],
... seed=1234)
>>> cars_xgb = H2OXGBoostEstimator(distribution="poisson",
... seed=1234)
>>> cars_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_xgb.mse(valid=True)
"""
return self._parms.get("distribution")
@distribution.setter
def distribution(self, distribution):
assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"))
self._parms["distribution"] = distribution
@property
def tweedie_power(self):
"""
Tweedie power for Tweedie regression, must be between 1 and 2.
Type: ``float``, defaults to ``1.5``.
:examples:
>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8],
... seed=1234)
>>> insurance_xgb = H2OXGBoostEstimator(distribution="tweedie",
... tweedie_power=1.2,
... seed=1234)
>>> insurance_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(insurance_xgb.mse(valid=True))
"""
return self._parms.get("tweedie_power")
@tweedie_power.setter
def tweedie_power(self, tweedie_power):
assert_is_type(tweedie_power, None, numeric)
self._parms["tweedie_power"] = tweedie_power
@property
def categorical_encoding(self):
"""
Encoding scheme for categorical features
Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
"sort_by_response", "enum_limited"]``, defaults to ``"auto"``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> encoding = "one_hot_explicit"
>>> airlines_xgb = H2OXGBoostEstimator(categorical_encoding=encoding,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
"""
return self._parms.get("categorical_encoding")
@categorical_encoding.setter
def categorical_encoding(self, categorical_encoding):
assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
self._parms["categorical_encoding"] = categorical_encoding
@property
def quiet_mode(self):
"""
Enable quiet mode
Type: ``bool``, defaults to ``True``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234, quiet_mode=True)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_xgb.mse(valid=True)
"""
return self._parms.get("quiet_mode")
@quiet_mode.setter
def quiet_mode(self, quiet_mode):
assert_is_type(quiet_mode, None, bool)
self._parms["quiet_mode"] = quiet_mode
@property
def checkpoint(self):
"""
Model checkpoint to resume training with.
Type: ``Union[None, str, H2OEstimator]``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","year","economy_20mpg"]
>>> response = "acceleration"
>>> from h2o.estimators import H2OXGBoostEstimator
>>> cars_xgb = H2OXGBoostEstimator(seed=1234)
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_xgb.mse()
>>> cars_xgb_continued = H2OXGBoostEstimator(checkpoint=cars_xgb.model_id,
... ntrees=51,
... seed=1234)
>>> cars_xgb_continued.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> cars_xgb_continued.mse()
"""
return self._parms.get("checkpoint")
@checkpoint.setter
def checkpoint(self, checkpoint):
assert_is_type(checkpoint, None, str, H2OEstimator)
self._parms["checkpoint"] = checkpoint
@property
def export_checkpoints_dir(self):
"""
Automatically export generated models to this directory.
Type: ``str``.
:examples:
>>> import tempfile
>>> from h2o.grid.grid_search import H2OGridSearch
>>> from os import listdir
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> hyper_parameters = {'ntrees': [5,10]}
>>> search_crit = {'strategy': "RandomDiscrete",
... 'max_models': 5,
... 'seed': 1234,
... 'stopping_rounds': 3,
... 'stopping_metric': "AUTO",
... 'stopping_tolerance': 1e-2}
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_grid = H2OGridSearch(H2OXGBoostEstimator,
... hyper_params=hyper_parameters,
... search_criteria=search_crit)
>>> air_grid.train(x=predictors,
... y=response,
... training_frame=airlines,
... distribution="bernoulli",
... learn_rate=0.1,
... max_depth=3,
... export_checkpoints_dir=checkpoints_dir)
>>> len(listdir(checkpoints_dir))
"""
return self._parms.get("export_checkpoints_dir")
@export_checkpoints_dir.setter
def export_checkpoints_dir(self, export_checkpoints_dir):
assert_is_type(export_checkpoints_dir, None, str)
self._parms["export_checkpoints_dir"] = export_checkpoints_dir
@property
def custom_metric_func(self):
"""
Reference to custom evaluation function, format: ``language:keyName=funcName``
Type: ``str``.
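:examples:
A hedged sketch: the reference string is produced by ``h2o.upload_custom_metric``; the map/reduce/metric class below follows the pattern from the H2O custom-metric docs and is illustrative only:
>>> class CustomRmseFunc:
...     def map(self, pred, act, w, o, model):
...         # squared-error contribution of one row
...         err = act[0] - pred[0]
...         return [err * err, 1]
...     def reduce(self, l, r):
...         return [l[0] + r[0], l[1] + r[1]]
...     def metric(self, l):
...         # runs inside the H2O JVM, hence Java math
...         import java.lang.Math as math
...         return math.sqrt(l[0] / l[1])
>>> custom_func = h2o.upload_custom_metric(CustomRmseFunc,
...                                        func_name="rmse",
...                                        func_file="mm_rmse.py")
>>> xgb = H2OXGBoostEstimator(ntrees=10, custom_metric_func=custom_func)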
"""
return self._parms.get("custom_metric_func")
@custom_metric_func.setter
def custom_metric_func(self, custom_metric_func):
assert_is_type(custom_metric_func, None, str)
self._parms["custom_metric_func"] = custom_metric_func
@property
def ntrees(self):
"""
(same as n_estimators) Number of trees.
Type: ``int``, defaults to ``50``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> tree_num = [20, 50, 80, 110, 140, 170, 200]
>>> label = ["20", "50", "80", "110",
... "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
...     # train one model per ntrees setting; 'key' indexes the matching label
...     titanic_xgb = H2OXGBoostEstimator(ntrees=num,
...                                       seed=1234)
...     titanic_xgb.train(x=predictors,
...                       y=response,
...                       training_frame=train,
...                       validation_frame=valid)
...     print(label[key], 'training score',
...           titanic_xgb.auc(train=True))
...     print(label[key], 'validation score',
...           titanic_xgb.auc(valid=True))
"""
return self._parms.get("ntrees")
@ntrees.setter
def ntrees(self, ntrees):
assert_is_type(ntrees, None, int)
self._parms["ntrees"] = ntrees
@property
def max_depth(self):
"""
Maximum tree depth (0 for unlimited).
Type: ``int``, defaults to ``6``.
:examples:
>>> df = h2o.import_file(path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> response = "survived"
>>> df[response] = df[response].asfactor()
>>> predictors = df.columns
>>> del predictors[1:3]
>>> train, valid, test = df.split_frame(ratios=[0.6,0.2],
... seed=1234,
... destination_frames=
... ['train.hex',
... 'valid.hex',
... 'test.hex'])
>>> xgb = H2OXGBoostEstimator()
>>> xgb.train(x=predictors,
... y=response,
... training_frame=train)
>>> perf = xgb.model_performance(valid)
>>> print(perf.auc())
"""
return self._parms.get("max_depth")
@max_depth.setter
def max_depth(self, max_depth):
assert_is_type(max_depth, None, int)
self._parms["max_depth"] = max_depth
@property
def min_rows(self):
"""
(same as min_child_weight) Fewest allowed (weighted) observations in a leaf.
Type: ``float``, defaults to ``1.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_rows=16,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("min_rows")
@min_rows.setter
def min_rows(self, min_rows):
assert_is_type(min_rows, None, numeric)
self._parms["min_rows"] = min_rows
@property
def min_child_weight(self):
"""
(same as min_rows) Fewest allowed (weighted) observations in a leaf.
Type: ``float``, defaults to ``1.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_child_weight=16,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("min_child_weight")
@min_child_weight.setter
def min_child_weight(self, min_child_weight):
assert_is_type(min_child_weight, None, numeric)
self._parms["min_child_weight"] = min_child_weight
@property
def learn_rate(self):
"""
(same as eta) Learning rate (from 0.0 to 1.0)
Type: ``float``, defaults to ``0.3``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000,
... learn_rate=0.01,
... stopping_rounds=5,
... stopping_metric="AUC",
... stopping_tolerance=1e-4,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("learn_rate")
@learn_rate.setter
def learn_rate(self, learn_rate):
assert_is_type(learn_rate, None, numeric)
self._parms["learn_rate"] = learn_rate
@property
def eta(self):
"""
(same as learn_rate) Learning rate (from 0.0 to 1.0)
Type: ``float``, defaults to ``0.3``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000,
... learn_rate=0.01,
... stopping_rounds=5,
... stopping_metric="AUC",
... stopping_tolerance=1e-4,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("eta")
@eta.setter
def eta(self, eta):
assert_is_type(eta, None, numeric)
self._parms["eta"] = eta
@property
def sample_rate(self):
"""
(same as subsample) Row sample rate per tree (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("sample_rate")
@sample_rate.setter
def sample_rate(self, sample_rate):
assert_is_type(sample_rate, None, numeric)
self._parms["sample_rate"] = sample_rate
@property
def subsample(self):
"""
(same as sample_rate) Row sample rate per tree (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("subsample")
@subsample.setter
def subsample(self, subsample):
assert_is_type(subsample, None, numeric)
self._parms["subsample"] = subsample
@property
def col_sample_rate(self):
"""
(same as colsample_bylevel) Column sample rate (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("col_sample_rate")
@col_sample_rate.setter
def col_sample_rate(self, col_sample_rate):
assert_is_type(col_sample_rate, None, numeric)
self._parms["col_sample_rate"] = col_sample_rate
@property
def colsample_bylevel(self):
"""
(same as col_sample_rate) Column sample rate (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("colsample_bylevel")
@colsample_bylevel.setter
def colsample_bylevel(self, colsample_bylevel):
assert_is_type(colsample_bylevel, None, numeric)
self._parms["colsample_bylevel"] = colsample_bylevel
@property
def col_sample_rate_per_tree(self):
"""
(same as colsample_bytree) Column sample rate per tree (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("col_sample_rate_per_tree")
@col_sample_rate_per_tree.setter
def col_sample_rate_per_tree(self, col_sample_rate_per_tree):
assert_is_type(col_sample_rate_per_tree, None, numeric)
self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree
@property
def colsample_bytree(self):
"""
(same as col_sample_rate_per_tree) Column sample rate per tree (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("colsample_bytree")
@colsample_bytree.setter
def colsample_bytree(self, colsample_bytree):
assert_is_type(colsample_bytree, None, numeric)
self._parms["colsample_bytree"] = colsample_bytree
@property
def colsample_bynode(self):
"""
Column sample rate per tree node (from 0.0 to 1.0)
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(colsample_bynode=.5,
... seed=1234)
>>> airlines_xgb.train(x=predictors, y=response,
... training_frame=train, validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("colsample_bynode")
@colsample_bynode.setter
def colsample_bynode(self, colsample_bynode):
assert_is_type(colsample_bynode, None, numeric)
self._parms["colsample_bynode"] = colsample_bynode
@property
def max_abs_leafnode_pred(self):
"""
(same as max_delta_step) Maximum absolute value of a leaf node prediction
Type: ``float``, defaults to ``0.0``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
... seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_abs_leafnode_pred=2.0,
... seed=1234)
>>> cov_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
"""
return self._parms.get("max_abs_leafnode_pred")
@max_abs_leafnode_pred.setter
def max_abs_leafnode_pred(self, max_abs_leafnode_pred):
assert_is_type(max_abs_leafnode_pred, None, float)
self._parms["max_abs_leafnode_pred"] = max_abs_leafnode_pred
@property
def max_delta_step(self):
"""
(same as max_abs_leafnode_pred) Maximum absolute value of a leaf node prediction
Type: ``float``, defaults to ``0.0``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
... seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_delta_step=2.0,
... seed=1234)
>>> cov_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
"""
return self._parms.get("max_delta_step")
@max_delta_step.setter
def max_delta_step(self, max_delta_step):
assert_is_type(max_delta_step, None, float)
self._parms["max_delta_step"] = max_delta_step
@property
def monotone_constraints(self):
"""
A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a
decreasing constraint.
Type: ``dict``.
:examples:
>>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor()
>>> response = "CAPSULE"
>>> seed = 42
>>> monotone_constraints = {"AGE": 1}
>>> xgb_model = H2OXGBoostEstimator(seed=seed,
... monotone_constraints=monotone_constraints)
>>> xgb_model.train(y=response,
... ignored_columns=["ID"],
... training_frame=prostate_hex)
>>> xgb_model.scoring_history()
"""
return self._parms.get("monotone_constraints")
@monotone_constraints.setter
def monotone_constraints(self, monotone_constraints):
assert_is_type(monotone_constraints, None, dict)
self._parms["monotone_constraints"] = monotone_constraints
@property
def interaction_constraints(self):
"""
A set of allowed column interactions.
Type: ``List[List[str]]``.
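:examples:
A minimal illustrative sketch: columns listed in the same sublist may interact with each other, columns in different sublists may not. The column names and dataset follow the airlines examples used elsewhere in this file.
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "DayOfWeek", "Month", "Distance"]
>>> response = "IsDepDelayed"
>>> airlines_xgb = H2OXGBoostEstimator(interaction_constraints=[["Origin", "Dest"], ["Year", "Month", "DayOfWeek"]],
... seed=1234)
>>> airlines_xgb.train(x=predictors, y=response, training_frame=airlines)
>>> print(airlines_xgb.auc(train=True))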
"""
return self._parms.get("interaction_constraints")
@interaction_constraints.setter
def interaction_constraints(self, interaction_constraints):
assert_is_type(interaction_constraints, None, [[str]])
self._parms["interaction_constraints"] = interaction_constraints
@property
def score_tree_interval(self):
"""
Score the model after every so many trees. Disabled if set to 0.
Type: ``int``, defaults to ``0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(score_tree_interval=5,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> airlines_xgb.scoring_history()
"""
return self._parms.get("score_tree_interval")
@score_tree_interval.setter
def score_tree_interval(self, score_tree_interval):
assert_is_type(score_tree_interval, None, int)
self._parms["score_tree_interval"] = score_tree_interval
@property
def min_split_improvement(self):
"""
(same as gamma) Minimum relative improvement in squared error reduction for a split to happen
Type: ``float``, defaults to ``0.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=0.55,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("min_split_improvement")
@min_split_improvement.setter
def min_split_improvement(self, min_split_improvement):
assert_is_type(min_split_improvement, None, float)
self._parms["min_split_improvement"] = min_split_improvement
@property
def gamma(self):
"""
(same as min_split_improvement) Minimum relative improvement in squared error reduction for a split to happen
Type: ``float``, defaults to ``0.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(gamma=1e-3,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("gamma")
@gamma.setter
def gamma(self, gamma):
assert_is_type(gamma, None, float)
self._parms["gamma"] = gamma
@property
def nthread(self):
"""
Number of parallel threads that can be used to run XGBoost. Cannot exceed H2O cluster limits (-nthreads
parameter). Defaults to maximum available.
Type: ``int``, defaults to ``-1``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> thread = 4
>>> titanic_xgb = H2OXGBoostEstimator(nthread=thread,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=titanic)
>>> print(titanic_xgb.auc(train=True))
"""
return self._parms.get("nthread")
@nthread.setter
def nthread(self, nthread):
assert_is_type(nthread, None, int)
self._parms["nthread"] = nthread
@property
def save_matrix_directory(self):
"""
Directory where to save matrices passed to XGBoost library. Useful for debugging.
Type: ``str``.
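:examples:
A minimal illustrative sketch, assuming a single-node H2O cluster running on the local machine so the temporary directory is writable by the server.
>>> import tempfile
>>> prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
>>> pros_xgb = H2OXGBoostEstimator(save_matrix_directory=tempfile.mkdtemp(),
... seed=1234)
>>> pros_xgb.train(y="CAPSULE", ignored_columns=["ID"], training_frame=prostate)
>>> print(pros_xgb.auc(train=True))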
"""
return self._parms.get("save_matrix_directory")
@save_matrix_directory.setter
def save_matrix_directory(self, save_matrix_directory):
assert_is_type(save_matrix_directory, None, str)
self._parms["save_matrix_directory"] = save_matrix_directory
@property
def build_tree_one_node(self):
"""
Run on one node only; no network overhead but fewer CPUs used. Suitable for small datasets.
Type: ``bool``, defaults to ``False``.
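:examples:
A minimal illustrative sketch: on a small dataset such as prostate, building on a single node avoids network overhead.
>>> prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
>>> pros_xgb = H2OXGBoostEstimator(build_tree_one_node=True,
... seed=1234)
>>> pros_xgb.train(y="CAPSULE", ignored_columns=["ID"], training_frame=prostate)
>>> print(pros_xgb.auc(train=True))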
"""
return self._parms.get("build_tree_one_node")
@build_tree_one_node.setter
def build_tree_one_node(self, build_tree_one_node):
assert_is_type(build_tree_one_node, None, bool)
self._parms["build_tree_one_node"] = build_tree_one_node
@property
def parallelize_cross_validation(self):
"""
Allow parallel training of cross-validation models
Type: ``bool``, defaults to ``True``.
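:examples:
A minimal illustrative sketch: disabling parallel cross-validation builds the fold models sequentially, trading runtime for lower peak memory use.
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> titanic_xgb = H2OXGBoostEstimator(nfolds=5,
... parallelize_cross_validation=False,
... seed=1234)
>>> titanic_xgb.train(x=predictors, y='survived', training_frame=titanic)
>>> print(titanic_xgb.auc(xval=True))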
"""
return self._parms.get("parallelize_cross_validation")
@parallelize_cross_validation.setter
def parallelize_cross_validation(self, parallelize_cross_validation):
assert_is_type(parallelize_cross_validation, None, bool)
self._parms["parallelize_cross_validation"] = parallelize_cross_validation
@property
def calibrate_model(self):
"""
Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class probabilities. Calibration can
provide more accurate estimates of class probabilities.
Type: ``bool``, defaults to ``False``.
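:examples:
A minimal illustrative sketch: part of the data is held out as a calibration frame; calibrated class probabilities then appear as extra columns in the prediction frame.
>>> prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
>>> train, calib = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_xgb = H2OXGBoostEstimator(calibrate_model=True,
... calibration_frame=calib,
... seed=1234)
>>> pros_xgb.train(y="CAPSULE", ignored_columns=["ID"], training_frame=train)
>>> pros_xgb.predict(calib)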
"""
return self._parms.get("calibrate_model")
@calibrate_model.setter
def calibrate_model(self, calibrate_model):
assert_is_type(calibrate_model, None, bool)
self._parms["calibrate_model"] = calibrate_model
@property
def calibration_frame(self):
"""
Data for model calibration
Type: ``Union[None, str, H2OFrame]``.
"""
return self._parms.get("calibration_frame")
@calibration_frame.setter
def calibration_frame(self, calibration_frame):
self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame')
@property
def calibration_method(self):
"""
Calibration method to use
Type: ``Literal["auto", "platt_scaling", "isotonic_regression"]``, defaults to ``"auto"``.
"""
return self._parms.get("calibration_method")
@calibration_method.setter
def calibration_method(self, calibration_method):
assert_is_type(calibration_method, None, Enum("auto", "platt_scaling", "isotonic_regression"))
self._parms["calibration_method"] = calibration_method
@property
def max_bins(self):
"""
For tree_method=hist only: maximum number of bins
Type: ``int``, defaults to ``256``.
:examples:
>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
... seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(tree_method="hist",
... max_bins=200,
... seed=1234)
>>> cov_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
"""
return self._parms.get("max_bins")
@max_bins.setter
def max_bins(self, max_bins):
assert_is_type(max_bins, None, int)
self._parms["max_bins"] = max_bins
@property
def max_leaves(self):
"""
For tree_method=hist only: maximum number of leaves
Type: ``int``, defaults to ``0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(tree_method="hist",
... grow_policy="lossguide",
... max_leaves=8,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("max_leaves")
@max_leaves.setter
def max_leaves(self, max_leaves):
assert_is_type(max_leaves, None, int)
self._parms["max_leaves"] = max_leaves
@property
def sample_type(self):
"""
For booster=dart only: sample_type
Type: ``Literal["uniform", "weighted"]``, defaults to ``"uniform"``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(booster="dart",
... sample_type="weighted",
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("sample_type")
@sample_type.setter
def sample_type(self, sample_type):
assert_is_type(sample_type, None, Enum("uniform", "weighted"))
self._parms["sample_type"] = sample_type
@property
def normalize_type(self):
"""
For booster=dart only: normalize_type
Type: ``Literal["tree", "forest"]``, defaults to ``"tree"``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
... normalize_type="tree",
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("normalize_type")
@normalize_type.setter
def normalize_type(self, normalize_type):
assert_is_type(normalize_type, None, Enum("tree", "forest"))
self._parms["normalize_type"] = normalize_type
@property
def rate_drop(self):
"""
For booster=dart only: rate_drop (0..1)
Type: ``float``, defaults to ``0.0``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
... rate_drop=0.1,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("rate_drop")
@rate_drop.setter
def rate_drop(self, rate_drop):
assert_is_type(rate_drop, None, float)
self._parms["rate_drop"] = rate_drop
@property
def one_drop(self):
"""
For booster=dart only: one_drop
Type: ``bool``, defaults to ``False``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
... one_drop=True,
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("one_drop")
@one_drop.setter
def one_drop(self, one_drop):
assert_is_type(one_drop, None, bool)
self._parms["one_drop"] = one_drop
@property
def skip_drop(self):
"""
For booster=dart only: skip_drop (0..1)
Type: ``float``, defaults to ``0.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(booster="dart",
... skip_drop=0.5,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train)
>>> airlines_xgb.auc(train=True)
"""
return self._parms.get("skip_drop")
@skip_drop.setter
def skip_drop(self, skip_drop):
assert_is_type(skip_drop, None, float)
self._parms["skip_drop"] = skip_drop
@property
def tree_method(self):
"""
Tree method
Type: ``Literal["auto", "exact", "approx", "hist"]``, defaults to ``"auto"``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8],
... seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(seed=1234,
... tree_method="approx")
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("tree_method")
@tree_method.setter
def tree_method(self, tree_method):
assert_is_type(tree_method, None, Enum("auto", "exact", "approx", "hist"))
self._parms["tree_method"] = tree_method
@property
def grow_policy(self):
"""
Grow policy - depthwise is standard GBM, lossguide is LightGBM
Type: ``Literal["depthwise", "lossguide"]``, defaults to ``"depthwise"``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> titanic["const_1"] = 6
>>> titanic["const_2"] = 7
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234,
... grow_policy="depthwise")
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
"""
return self._parms.get("grow_policy")
@grow_policy.setter
def grow_policy(self, grow_policy):
assert_is_type(grow_policy, None, Enum("depthwise", "lossguide"))
self._parms["grow_policy"] = grow_policy
@property
def booster(self):
"""
Booster type
Type: ``Literal["gbtree", "gblinear", "dart"]``, defaults to ``"gbtree"``.
:examples:
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
... seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
... normalize_type="tree",
... seed=1234)
>>> titanic_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
"""
return self._parms.get("booster")
@booster.setter
def booster(self, booster):
assert_is_type(booster, None, Enum("gbtree", "gblinear", "dart"))
self._parms["booster"] = booster
@property
def reg_lambda(self):
"""
L2 regularization
Type: ``float``, defaults to ``1.0``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
... "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8])
>>> airlines_xgb = H2OXGBoostEstimator(reg_lambda=.0001,
... seed=1234)
>>> airlines_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
"""
return self._parms.get("reg_lambda")
@reg_lambda.setter
def reg_lambda(self, reg_lambda):
assert_is_type(reg_lambda, None, float)
self._parms["reg_lambda"] = reg_lambda
@property
def reg_alpha(self):
"""
L1 regularization
Type: ``float``, defaults to ``0.0``.
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(reg_alpha=.25)
>>> boston_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> print(boston_xgb.mse(valid=True))
"""
return self._parms.get("reg_alpha")
@reg_alpha.setter
def reg_alpha(self, reg_alpha):
assert_is_type(reg_alpha, None, float)
self._parms["reg_alpha"] = reg_alpha
@property
def dmatrix_type(self):
"""
Type of DMatrix. For sparse, NAs and 0 are treated equally.
Type: ``Literal["auto", "dense", "sparse"]``, defaults to ``"auto"``.
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(dmatrix_type="auto",
... seed=1234)
>>> boston_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> boston_xgb.mse()
"""
return self._parms.get("dmatrix_type")
@dmatrix_type.setter
def dmatrix_type(self, dmatrix_type):
assert_is_type(dmatrix_type, None, Enum("auto", "dense", "sparse"))
self._parms["dmatrix_type"] = dmatrix_type
@property
def backend(self):
"""
Backend. By default (auto), a GPU is used if available.
Type: ``Literal["auto", "gpu", "cpu"]``, defaults to ``"auto"``.
:examples:
>>> pros = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> pros["CAPSULE"] = pros["CAPSULE"].asfactor()
>>> pros_xgb = H2OXGBoostEstimator(tree_method="exact",
... seed=123,
... backend="cpu")
>>> pros_xgb.train(y="CAPSULE",
... ignored_columns=["ID"],
... training_frame=pros)
>>> pros_xgb.auc()
"""
return self._parms.get("backend")
@backend.setter
def backend(self, backend):
assert_is_type(backend, None, Enum("auto", "gpu", "cpu"))
self._parms["backend"] = backend
@property
def gpu_id(self):
"""
Which GPU(s) to use.
Type: ``List[int]``.
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(gpu_id=0,
... seed=1234)
>>> boston_xgb.train(x=predictors,
... y=response,
... training_frame=train,
... validation_frame=valid)
>>> boston_xgb.mse()
"""
return self._parms.get("gpu_id")
@gpu_id.setter
def gpu_id(self, gpu_id):
assert_is_type(gpu_id, None, int, [int])
self._parms["gpu_id"] = gpu_id
@property
def gainslift_bins(self):
"""
Gains/Lift table number of bins; 0 means disabled. Default value -1 means automatic binning.
Type: ``int``, defaults to ``-1``.
:examples:
>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OXGBoostEstimator(ntrees=1, gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
... y="IsDepDelayed",
... training_frame=airlines)
>>> model.gains_lift()
"""
return self._parms.get("gainslift_bins")
@gainslift_bins.setter
def gainslift_bins(self, gainslift_bins):
assert_is_type(gainslift_bins, None, int)
self._parms["gainslift_bins"] = gainslift_bins
@property
def auc_type(self):
"""
Set default multinomial AUC type.
Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
``"auto"``.
"""
return self._parms.get("auc_type")
@auc_type.setter
def auc_type(self, auc_type):
assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
self._parms["auc_type"] = auc_type
@property
def scale_pos_weight(self):
"""
Controls the weight of observations with positive labels relative to observations with negative labels in the
gradient calculation. Useful for imbalanced problems.
Type: ``float``, defaults to ``1.0``.
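:examples:
A minimal illustrative sketch: upweights the positive class of a binomial response; the value 1.2 is purely illustrative.
>>> prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
>>> pros_xgb = H2OXGBoostEstimator(scale_pos_weight=1.2,
... seed=1234)
>>> pros_xgb.train(y="CAPSULE", ignored_columns=["ID"], training_frame=prostate)
>>> print(pros_xgb.auc(train=True))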
"""
return self._parms.get("scale_pos_weight")
@scale_pos_weight.setter
def scale_pos_weight(self, scale_pos_weight):
assert_is_type(scale_pos_weight, None, float)
self._parms["scale_pos_weight"] = scale_pos_weight
@property
def eval_metric(self):
"""
Specification of the evaluation metric that will be passed to the native XGBoost backend.
Type: ``str``.
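:examples:
A minimal illustrative sketch: "logloss" is a native XGBoost metric name (not an H2O stopping_metric value); the scoring history then reports it at each scoring event.
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> titanic_xgb = H2OXGBoostEstimator(eval_metric="logloss",
... score_tree_interval=5,
... seed=1234)
>>> titanic_xgb.train(x=predictors, y='survived', training_frame=titanic)
>>> titanic_xgb.scoring_history()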
"""
return self._parms.get("eval_metric")
@eval_metric.setter
def eval_metric(self, eval_metric):
assert_is_type(eval_metric, None, str)
self._parms["eval_metric"] = eval_metric
@property
def score_eval_metric_only(self):
"""
If enabled, score only the evaluation metric. This can make model training faster if scoring is frequent (e.g. at
each iteration).
Type: ``bool``, defaults to ``False``.
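:examples:
A minimal illustrative sketch: when scoring after every tree, computing only the eval_metric keeps scoring cheap.
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> titanic_xgb = H2OXGBoostEstimator(eval_metric="auc",
... score_eval_metric_only=True,
... score_tree_interval=1,
... seed=1234)
>>> titanic_xgb.train(x=predictors, y='survived', training_frame=titanic)
>>> titanic_xgb.scoring_history()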
"""
return self._parms.get("score_eval_metric_only")
@score_eval_metric_only.setter
def score_eval_metric_only(self, score_eval_metric_only):
assert_is_type(score_eval_metric_only, None, bool)
self._parms["score_eval_metric_only"] = score_eval_metric_only
[docs] @staticmethod
def available():
"""
Ask the H2O server whether an XGBoost model can be built (depends on availability of native backends).
:return: True if an XGBoost model can be built, or False otherwise.
:examples:
>>> H2OXGBoostEstimator.available()
"""
if "XGBoost" not in h2o.cluster().list_core_extensions():
print("Cannot build an XGBoost model - no backend found.")
return False
else:
return True
[docs] def convert_H2OXGBoostParams_2_XGBoostParams(self):
"""
In order to use convert_H2OXGBoostParams_2_XGBoostParams and convert_H2OFrame_2_DMatrix, the following
packages must be available: xgboost, pandas, numpy, and scipy.sparse.
Given an H2OXGBoost model, this method will generate the corresponding parameters that should be used by
native XGBoost in order to give exactly the same result, assuming that the same dataset
(derived from h2oFrame) is used to train the native XGBoost model.
Follow the steps below to compare H2OXGBoost and native XGBoost:
1. Train the H2OXGBoost model with H2OFrame trainFile and generate a prediction:
- h2oModelD = H2OXGBoostEstimator(\*\*h2oParamsD) # parameters specified as a dict()
- h2oModelD.train(x=myX, y=y, training_frame=trainFile) # train with H2OFrame trainFile
- h2oPredict = h2oModelD.predict(trainFile)
2. Derive the DMatrix from H2OFrame:
- nativeDMatrix = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)
3. Derive the parameters for native XGBoost:
- nativeParams = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
4. Train your native XGBoost model and generate a prediction:
- nativeModel = xgb.train(params=nativeParams[0], dtrain=nativeDMatrix, num_boost_round=nativeParams[1])
- nativePredict = nativeModel.predict(data=nativeDMatrix, ntree_limit=nativeParams[1])
5. Compare the predictions: h2oPredict from H2OXGBoost against nativePredict from native XGBoost.
:return: nativeParams, num_boost_round
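:examples:
A condensed sketch of the workflow above; it assumes the xgboost package is installed and that an H2OXGBoost model h2oModelD has already been trained on H2OFrame trainFile with predictors myX and response y, as in step 1.
>>> import xgboost as xgb
>>> nativeParams, ntrees = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
>>> nativeDMatrix = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)
>>> nativeModel = xgb.train(params=nativeParams, dtrain=nativeDMatrix, num_boost_round=ntrees)
>>> nativePredict = nativeModel.predict(data=nativeDMatrix, ntree_limit=ntrees)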
"""
import xgboost as xgb  # fails fast if the native xgboost package is not available
nativeParams = self._model_json["output"]["native_parameters"]
nativeXGBoostParams = dict()
# each row of the native_parameters table holds (row index, parameter name, parameter value)
for (_, keyname, keyvalue) in nativeParams.cell_values:
    nativeXGBoostParams[keyname] = keyvalue
paramsSet = self.full_parameters
return nativeXGBoostParams, paramsSet['ntrees']['actual_value']