Source code for h2o.estimators.xgboost

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals

import h2o
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


[docs]class H2OXGBoostEstimator(H2OEstimator): """ XGBoost Builds an eXtreme Gradient Boosting model using the native XGBoost backend. """ algo = "xgboost" param_names = {"model_id", "training_frame", "validation_frame", "nfolds", "keep_cross_validation_models", "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment", "score_each_iteration", "fold_assignment", "fold_column", "response_column", "ignored_columns", "ignore_const_cols", "offset_column", "weights_column", "stopping_rounds", "stopping_metric", "stopping_tolerance", "max_runtime_secs", "seed", "distribution", "tweedie_power", "categorical_encoding", "quiet_mode", "checkpoint", "export_checkpoints_dir", "ntrees", "max_depth", "min_rows", "min_child_weight", "learn_rate", "eta", "sample_rate", "subsample", "col_sample_rate", "colsample_bylevel", "col_sample_rate_per_tree", "colsample_bytree", "colsample_bynode", "max_abs_leafnode_pred", "max_delta_step", "monotone_constraints", "score_tree_interval", "min_split_improvement", "gamma", "nthread", "save_matrix_directory", "build_tree_one_node", "calibrate_model", "calibration_frame", "max_bins", "max_leaves", "sample_type", "normalize_type", "rate_drop", "one_drop", "skip_drop", "tree_method", "grow_policy", "booster", "reg_lambda", "reg_alpha", "dmatrix_type", "backend", "gpu_id", "gainslift_bins"} def __init__(self, **kwargs): super(H2OXGBoostEstimator, self).__init__() self._parms = {} for pname, pvalue in kwargs.items(): if pname == 'model_id': self._id = pvalue self._parms["model_id"] = pvalue elif pname in self.param_names: # Using setattr(...) will invoke type-checking of the arguments setattr(self, pname, pvalue) else: raise H2OValueError("Unknown parameter %s = %r" % (pname, pvalue)) @property def training_frame(self): """ Id of the training data frame. Type: ``H2OFrame``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("training_frame") @training_frame.setter def training_frame(self, training_frame): self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') @property def validation_frame(self): """ Id of the validation data frame. Type: ``H2OFrame``. :examples: >>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv") >>> insurance['Group'] = insurance['Group'].asfactor() >>> insurance['Age'] = insurance['Age'].asfactor() >>> predictors = insurance.columns[0:4] >>> response = 'Claims' >>> train, valid = insurance.split_frame(ratios=[.8], ... seed=1234) >>> insurance_xgb = H2OXGBoostEstimator(seed=1234) >>> insurance_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(insurance_xgb.mse(valid=True)) """ return self._parms.get("validation_frame") @validation_frame.setter def validation_frame(self, validation_frame): self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame') @property def nfolds(self): """ Number of folds for K-fold cross-validation (0 to disable or >= 2). Type: ``int`` (default: ``0``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> folds = 5 >>> titanic_xgb = H2OXGBoostEstimator(nfolds=folds, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_xgb.auc(xval=True) """ return self._parms.get("nfolds") @nfolds.setter def nfolds(self, nfolds): assert_is_type(nfolds, None, int) self._parms["nfolds"] = nfolds @property def keep_cross_validation_models(self): """ Whether to keep the cross-validation models. Type: ``bool`` (default: ``True``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_models=True, ... nfolds=5 , ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> titanic_xgb.cross_validation_models() """ return self._parms.get("keep_cross_validation_models") @keep_cross_validation_models.setter def keep_cross_validation_models(self, keep_cross_validation_models): assert_is_type(keep_cross_validation_models, None, bool) self._parms["keep_cross_validation_models"] = keep_cross_validation_models @property def keep_cross_validation_predictions(self): """ Whether to keep the predictions of the cross-validation models. Type: ``bool`` (default: ``False``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_predictions=True, ... nfolds=5, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> titanic_xgb.cross_validation_predictions() """ return self._parms.get("keep_cross_validation_predictions") @keep_cross_validation_predictions.setter def keep_cross_validation_predictions(self, keep_cross_validation_predictions): assert_is_type(keep_cross_validation_predictions, None, bool) self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions @property def keep_cross_validation_fold_assignment(self): """ Whether to keep the cross-validation fold assignment. Type: ``bool`` (default: ``False``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_fold_assignment=True, ... nfolds=5, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> titanic_xgb.cross_validation_fold_assignment() """ return self._parms.get("keep_cross_validation_fold_assignment") @keep_cross_validation_fold_assignment.setter def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment): assert_is_type(keep_cross_validation_fold_assignment, None, bool) self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment @property def score_each_iteration(self): """ Whether to score during each iteration of model training. Type: ``bool`` (default: ``False``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(score_each_iteration=True, ... ntrees=55, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.scoring_history() """ return self._parms.get("score_each_iteration") @score_each_iteration.setter def score_each_iteration(self, score_each_iteration): assert_is_type(score_each_iteration, None, bool) self._parms["score_each_iteration"] = score_each_iteration @property def fold_assignment(self): """ Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. One of: ``"auto"``, ``"random"``, ``"modulo"``, ``"stratified"`` (default: ``"auto"``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> assignment_type = "Random" >>> titanic_xgb = H2OXGBoostEstimator(fold_assignment=assignment_type, ... nfolds=5, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_xgb.auc(xval=True) """ return self._parms.get("fold_assignment") @fold_assignment.setter def fold_assignment(self, fold_assignment): assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified")) self._parms["fold_assignment"] = fold_assignment @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> fold_numbers = titanic.kfold_column(n_folds=5, ... seed=1234) >>> fold_numbers.set_names(["fold_numbers"]) >>> titanic = titanic.cbind(fold_numbers) >>> print(titanic['fold_numbers']) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic, ... fold_column="fold_numbers") >>> titanic_xgb.auc(xval=True) """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def ignore_const_cols(self): """ Ignore constant columns. Type: ``bool`` (default: ``True``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> titanic["const_1"] = 6 >>> titanic["const_2"] = 7 >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234, ... ignore_const_cols=True) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("ignore_const_cols") @ignore_const_cols.setter def ignore_const_cols(self, ignore_const_cols): assert_is_type(ignore_const_cols, None, bool) self._parms["ignore_const_cols"] = ignore_const_cols @property def offset_column(self): """ Offset column. This will be added to the combination of columns before applying the link function. Type: ``str``. """ return self._parms.get("offset_column") @offset_column.setter def offset_column(self, offset_column): assert_is_type(offset_column, None, str) self._parms["offset_column"] = offset_column @property def weights_column(self): """ Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. Type: ``str``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("weights_column") @weights_column.setter def weights_column(self, weights_column): assert_is_type(weights_column, None, str) self._parms["weights_column"] = weights_column @property def stopping_rounds(self): """ Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Type: ``int`` (default: ``0``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("stopping_rounds") @stopping_rounds.setter def stopping_rounds(self, stopping_rounds): assert_is_type(stopping_rounds, None, int) self._parms["stopping_rounds"] = stopping_rounds @property def stopping_metric(self): """ Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anonomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. One of: ``"auto"``, ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``"aucpr"``, ``"lift_top_group"``, ``"misclassification"``, ``"mean_per_class_error"``, ``"custom"``, ``"custom_increasing"`` (default: ``"auto"``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("stopping_metric") @stopping_metric.setter def stopping_metric(self, stopping_metric): assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing")) self._parms["stopping_metric"] = stopping_metric @property def stopping_tolerance(self): """ Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Type: ``float`` (default: ``0.001``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("stopping_tolerance") @stopping_tolerance.setter def stopping_tolerance(self, stopping_tolerance): assert_is_type(stopping_tolerance, None, numeric) self._parms["stopping_tolerance"] = stopping_tolerance @property def max_runtime_secs(self): """ Maximum allowed runtime in seconds for model training. Use 0 to disable. Type: ``float`` (default: ``0``). :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_runtime_secs=10, ... ntrees=10000, ... max_depth=10, ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_runtime_secs") @max_runtime_secs.setter def max_runtime_secs(self, max_runtime_secs): assert_is_type(max_runtime_secs, None, numeric) self._parms["max_runtime_secs"] = max_runtime_secs @property def seed(self): """ Seed for pseudo random number generator (if applicable) Type: ``int`` (default: ``-1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> xgb_w_seed_1 = H2OXGBoostEstimator(col_sample_rate=.7, ... seed=1234) >>> xgb_w_seed_1.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> xgb_w_seed_2 = H2OXGBoostEstimator(col_sample_rate = .7, ... seed = 1234) >>> xgb_w_seed_2.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('auc for the 1st model built with a seed:', ... xgb_w_seed_1.auc(valid=True)) >>> print('auc for the 2nd model built with a seed:', ... xgb_w_seed_2.auc(valid=True)) """ return self._parms.get("seed") @seed.setter def seed(self, seed): assert_is_type(seed, None, int) self._parms["seed"] = seed @property def distribution(self): """ Distribution function One of: ``"auto"``, ``"bernoulli"``, ``"multinomial"``, ``"gaussian"``, ``"poisson"``, ``"gamma"``, ``"tweedie"``, ``"laplace"``, ``"quantile"``, ``"huber"`` (default: ``"auto"``). :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "cylinders" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_xgb = H2OXGBoostEstimator(distribution="poisson", ... seed=1234) >>> cars_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_xgb.mse(valid=True) """ return self._parms.get("distribution") @distribution.setter def distribution(self, distribution): assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber")) self._parms["distribution"] = distribution @property def tweedie_power(self): """ Tweedie power for Tweedie regression, must be between 1 and 2. Type: ``float`` (default: ``1.5``). :examples: >>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv") >>> predictors = insurance.columns[0:4] >>> response = 'Claims' >>> insurance['Group'] = insurance['Group'].asfactor() >>> insurance['Age'] = insurance['Age'].asfactor() >>> train, valid = insurance.split_frame(ratios=[.8], ... seed=1234) >>> insurance_xgb = H2OXGBoostEstimator(distribution="tweedie", ... tweedie_power=1.2, ... seed=1234) >>> insurance_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(insurance_xgb.mse(valid=True)) """ return self._parms.get("tweedie_power") @tweedie_power.setter def tweedie_power(self, tweedie_power): assert_is_type(tweedie_power, None, numeric) self._parms["tweedie_power"] = tweedie_power @property def categorical_encoding(self): """ Encoding scheme for categorical features One of: ``"auto"``, ``"enum"``, ``"one_hot_internal"``, ``"one_hot_explicit"``, ``"binary"``, ``"eigen"``, ``"label_encoder"``, ``"sort_by_response"``, ``"enum_limited"`` (default: ``"auto"``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> encoding = "one_hot_explicit" >>> airlines_xgb = H2OXGBoostEstimator(categorical_encoding=encoding, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("categorical_encoding") @categorical_encoding.setter def categorical_encoding(self, categorical_encoding): assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) self._parms["categorical_encoding"] = categorical_encoding @property def quiet_mode(self): """ Enable quiet mode Type: ``bool`` (default: ``True``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234, quiet_mode=True) >>> titanic_xgb.train(x=predictors ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.mse(valid=True) """ return self._parms.get("quiet_mode") @quiet_mode.setter def quiet_mode(self, quiet_mode): assert_is_type(quiet_mode, None, bool) self._parms["quiet_mode"] = quiet_mode @property def checkpoint(self): """ Model checkpoint to resume training with. Type: ``str``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","year","economy_20mpg"] >>> response = "acceleration" >>> from h2o.estimators import H2OXGBoostEstimator >>> cars_xgb = H2OXGBoostEstimator(seed=1234) >>> train, valid = cars.split_frame(ratios=[.8]) >>> cars_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_xgb.mse() >>> cars_xgb_continued = H2OXGBoostEstimator(checkpoint=cars_xgb.model_id, ... ntrees=51, ... seed=1234) >>> cars_xgb_continued.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_xgb_continued.mse() """ return self._parms.get("checkpoint") @checkpoint.setter def checkpoint(self, checkpoint): assert_is_type(checkpoint, None, str, H2OEstimator) self._parms["checkpoint"] = checkpoint @property def export_checkpoints_dir(self): """ Automatically export generated models to this directory. Type: ``str``. :examples: >>> import tempfile >>> from h2o.grid.grid_search import H2OGridSearch >>> from os import listdir >>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex") >>> predictors = ["DayofMonth", "DayOfWeek"] >>> response = "IsDepDelayed" >>> hyper_parameters = {'ntrees': [5,10]} >>> search_crit = {'strategy': "RandomDiscrete", ... 'max_models': 5, ... 'seed': 1234, ... 'stopping_rounds': 3, ... 'stopping_metric': "AUTO", ... 'stopping_tolerance': 1e-2} >>> checkpoints_dir = tempfile.mkdtemp() >>> air_grid = H2OGridSearch(H2OXGBoostEstimator, ... hyper_params=hyper_parameters, ... search_criteria=search_crit) >>> air_grid.train(x=predictors, ... y=response, ... training_frame=airlines, ... distribution="bernoulli", ... learn_rate=0.1, ... max_depth=3, ... export_checkpoints_dir=checkpoints_dir) >>> len(listdir(checkpoints_dir)) """ return self._parms.get("export_checkpoints_dir") @export_checkpoints_dir.setter def export_checkpoints_dir(self, export_checkpoints_dir): assert_is_type(export_checkpoints_dir, None, str) self._parms["export_checkpoints_dir"] = export_checkpoints_dir @property def ntrees(self): """ (same as n_estimators) Number of trees. Type: ``int`` (default: ``50``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> tree_num = [20, 50, 80, 110, 140, 170, 200] >>> label = ["20", "50", "80", "110", ... "140", "170", "200"] >>> for key, num in enumerate(tree_num): # Input integer for 'num' and 'key' >>> titanic_xgb = H2OXGBoostEstimator(ntrees=num, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(label[key], 'training score', ... titanic_xgb.auc(train=True)) >>> print(label[key], 'validation score', ... titanic_xgb.auc(valid=True)) """ return self._parms.get("ntrees") @ntrees.setter def ntrees(self, ntrees): assert_is_type(ntrees, None, int) self._parms["ntrees"] = ntrees @property def max_depth(self): """ Maximum tree depth (0 for unlimited). Type: ``int`` (default: ``6``). :examples: >>> df = h2o.import_file(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> response = "survived" >>> df[response] = df[response].asfactor() >>> predictors = df.columns >>> del predictors[1:3] >>> train, valid, test = df.split_frame(ratios=[0.6,0.2], ... seed=1234, ... destination_frames= ... ['train.hex', ... 'valid.hex', ... 'test.hex']) >>> xgb = H2OXGBoostEstimator() >>> xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> perf = xgb.model_performance(valid) >>> print perf.auc() """ return self._parms.get("max_depth") @max_depth.setter def max_depth(self, max_depth): assert_is_type(max_depth, None, int) self._parms["max_depth"] = max_depth @property def min_rows(self): """ (same as min_child_weight) Fewest allowed (weighted) observations in a leaf. Type: ``float`` (default: ``1``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_rows=16, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("min_rows") @min_rows.setter def min_rows(self, min_rows): assert_is_type(min_rows, None, numeric) self._parms["min_rows"] = min_rows @property def min_child_weight(self): """ (same as min_rows) Fewest allowed (weighted) observations in a leaf. Type: ``float`` (default: ``1``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_child_weight=16, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("min_child_weight") @min_child_weight.setter def min_child_weight(self, min_child_weight): assert_is_type(min_child_weight, None, numeric) self._parms["min_child_weight"] = min_child_weight @property def learn_rate(self): """ (same as eta) Learning rate (from 0.0 to 1.0) Type: ``float`` (default: ``0.3``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000, ... learn_rate=0.01, ... stopping_rounds=5, ... stopping_metric="AUC", ... stopping_tolerance=1e-4, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("learn_rate") @learn_rate.setter def learn_rate(self, learn_rate): assert_is_type(learn_rate, None, numeric) self._parms["learn_rate"] = learn_rate @property def eta(self): """ (same as learn_rate) Learning rate (from 0.0 to 1.0) Type: ``float`` (default: ``0.3``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000, ... learn_rate=0.01, ... stopping_rounds=5, ... stopping_metric="AUC", ... stopping_tolerance=1e-4, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("eta") @eta.setter def eta(self, eta): assert_is_type(eta, None, numeric) self._parms["eta"] = eta @property def sample_rate(self): """ (same as subsample) Row sample rate per tree (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("sample_rate") @sample_rate.setter def sample_rate(self, sample_rate): assert_is_type(sample_rate, None, numeric) self._parms["sample_rate"] = sample_rate @property def subsample(self): """ (same as sample_rate) Row sample rate per tree (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("subsample") @subsample.setter def subsample(self, subsample): assert_is_type(subsample, None, numeric) self._parms["subsample"] = subsample @property def col_sample_rate(self): """ (same as colsample_bylevel) Column sample rate (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("col_sample_rate") @col_sample_rate.setter def col_sample_rate(self, col_sample_rate): assert_is_type(col_sample_rate, None, numeric) self._parms["col_sample_rate"] = col_sample_rate @property def colsample_bylevel(self): """ (same as col_sample_rate) Column sample rate (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("colsample_bylevel") @colsample_bylevel.setter def colsample_bylevel(self, colsample_bylevel): assert_is_type(colsample_bylevel, None, numeric) self._parms["colsample_bylevel"] = colsample_bylevel @property def col_sample_rate_per_tree(self): """ (same as colsample_bytree) Column sample rate per tree (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("col_sample_rate_per_tree") @col_sample_rate_per_tree.setter def col_sample_rate_per_tree(self, col_sample_rate_per_tree): assert_is_type(col_sample_rate_per_tree, None, numeric) self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree @property def colsample_bytree(self): """ (same as col_sample_rate_per_tree) Column sample rate per tree (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("colsample_bytree") @colsample_bytree.setter def colsample_bytree(self, colsample_bytree): assert_is_type(colsample_bytree, None, numeric) self._parms["colsample_bytree"] = colsample_bytree @property def colsample_bynode(self): """ Column sample rate per tree node (from 0.0 to 1.0) Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(colsample_bynode=.5, ... seed=1234) >>> airlines_xgb.train(x=predictors, y=response, ... training_frame=train, validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("colsample_bynode") @colsample_bynode.setter def colsample_bynode(self, colsample_bynode): assert_is_type(colsample_bynode, None, numeric) self._parms["colsample_bynode"] = colsample_bynode @property def max_abs_leafnode_pred(self): """ (same as max_delta_step) Maximum absolute value of a leaf node prediction Type: ``float`` (default: ``0``). :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_abs_leafnode_pred=float(2), ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_abs_leafnode_pred") @max_abs_leafnode_pred.setter def max_abs_leafnode_pred(self, max_abs_leafnode_pred): assert_is_type(max_abs_leafnode_pred, None, float) self._parms["max_abs_leafnode_pred"] = max_abs_leafnode_pred @property def max_delta_step(self): """ (same as max_abs_leafnode_pred) Maximum absolute value of a leaf node prediction Type: ``float`` (default: ``0``). :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_delta_step=float(2), ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_delta_step") @max_delta_step.setter def max_delta_step(self, max_delta_step): assert_is_type(max_delta_step, None, float) self._parms["max_delta_step"] = max_delta_step @property def monotone_constraints(self): """ A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. Type: ``dict``. :examples: >>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor() >>> response = "CAPSULE" >>> seed=42 >>> monotone_constraints={"AGE":1} >>> xgb_model = H2OXGBoostEstimator(seed=seed, ... monotone_constraints=monotone_constraints) >>> xgb_model.train(y=response, ... ignored_columns=["ID"], ... training_frame=prostate_hex) >>> xgb_model.scoring_history() """ return self._parms.get("monotone_constraints") @monotone_constraints.setter def monotone_constraints(self, monotone_constraints): assert_is_type(monotone_constraints, None, dict) self._parms["monotone_constraints"] = monotone_constraints @property def score_tree_interval(self): """ Score the model after every so many trees. Disabled if set to 0. Type: ``int`` (default: ``0``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(score_tree_interval=5, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.scoring_history() """ return self._parms.get("score_tree_interval") @score_tree_interval.setter def score_tree_interval(self, score_tree_interval): assert_is_type(score_tree_interval, None, int) self._parms["score_tree_interval"] = score_tree_interval @property def min_split_improvement(self): """ (same as gamma) Minimum relative improvement in squared error reduction for a split to happen Type: ``float`` (default: ``0``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=0.55, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("min_split_improvement") @min_split_improvement.setter def min_split_improvement(self, min_split_improvement): assert_is_type(min_split_improvement, None, float) self._parms["min_split_improvement"] = min_split_improvement @property def gamma(self): """ (same as min_split_improvement) Minimum relative improvement in squared error reduction for a split to happen Type: ``float`` (default: ``0``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=1e-3, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("gamma") @gamma.setter def gamma(self, gamma): assert_is_type(gamma, None, float) self._parms["gamma"] = gamma @property def nthread(self): """ Number of parallel threads that can be used to run XGBoost. Cannot exceed H2O cluster limits (-nthreads parameter). Defaults to maximum available Type: ``int`` (default: ``-1``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> thread = 4 >>> titanic_xgb = H2OXGBoostEstimator(nthread=thread, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic) >>> print(titanic_xgb.auc(train=True)) """ return self._parms.get("nthread") @nthread.setter def nthread(self, nthread): assert_is_type(nthread, None, int) self._parms["nthread"] = nthread @property def save_matrix_directory(self): """ Directory where to save matrices passed to XGBoost library. Useful for debugging. Type: ``str``. """ return self._parms.get("save_matrix_directory") @save_matrix_directory.setter def save_matrix_directory(self, save_matrix_directory): assert_is_type(save_matrix_directory, None, str) self._parms["save_matrix_directory"] = save_matrix_directory @property def build_tree_one_node(self): """ Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets. Type: ``bool`` (default: ``False``). """ return self._parms.get("build_tree_one_node") @build_tree_one_node.setter def build_tree_one_node(self, build_tree_one_node): assert_is_type(build_tree_one_node, None, bool) self._parms["build_tree_one_node"] = build_tree_one_node @property def calibrate_model(self): """ Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Type: ``bool`` (default: ``False``). """ return self._parms.get("calibrate_model") @calibrate_model.setter def calibrate_model(self, calibrate_model): assert_is_type(calibrate_model, None, bool) self._parms["calibrate_model"] = calibrate_model @property def calibration_frame(self): """ Calibration frame for Platt Scaling Type: ``H2OFrame``. """ return self._parms.get("calibration_frame") @calibration_frame.setter def calibration_frame(self, calibration_frame): self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame') @property def max_bins(self): """ For tree_method=hist only: maximum number of bins Type: ``int`` (default: ``256``). :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_bins=200, ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_bins") @max_bins.setter def max_bins(self, max_bins): assert_is_type(max_bins, None, int) self._parms["max_bins"] = max_bins @property def max_leaves(self): """ For tree_method=hist only: maximum number of leaves Type: ``int`` (default: ``0``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(max_leaves=0, seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("max_leaves") @max_leaves.setter def max_leaves(self, max_leaves): assert_is_type(max_leaves, None, int) self._parms["max_leaves"] = max_leaves @property def sample_type(self): """ For booster=dart only: sample_type One of: ``"uniform"``, ``"weighted"`` (default: ``"uniform"``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"]= airlines["Year"].asfactor() >>> airlines["Month"]= airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(sample_type="weighted", ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("sample_type") @sample_type.setter def sample_type(self, sample_type): assert_is_type(sample_type, None, Enum("uniform", "weighted")) self._parms["sample_type"] = sample_type @property def normalize_type(self): """ For booster=dart only: normalize_type One of: ``"tree"``, ``"forest"`` (default: ``"tree"``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(booster='dart', ... normalize_type="tree", ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("normalize_type") @normalize_type.setter def normalize_type(self, normalize_type): assert_is_type(normalize_type, None, Enum("tree", "forest")) self._parms["normalize_type"] = normalize_type @property def rate_drop(self): """ For booster=dart only: rate_drop (0..1) Type: ``float`` (default: ``0``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(rate_drop=0.1, seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("rate_drop") @rate_drop.setter def rate_drop(self, rate_drop): assert_is_type(rate_drop, None, float) self._parms["rate_drop"] = rate_drop @property def one_drop(self): """ For booster=dart only: one_drop Type: ``bool`` (default: ``False``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(booster='dart', ... one_drop=True, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("one_drop") @one_drop.setter def one_drop(self, one_drop): assert_is_type(one_drop, None, bool) self._parms["one_drop"] = one_drop @property def skip_drop(self): """ For booster=dart only: skip_drop (0..1) Type: ``float`` (default: ``0``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(skip_drop=0.5, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> airlines_xgb.auc(train=True) """ return self._parms.get("skip_drop") @skip_drop.setter def skip_drop(self, skip_drop): assert_is_type(skip_drop, None, float) self._parms["skip_drop"] = skip_drop @property def tree_method(self): """ Tree method One of: ``"auto"``, ``"exact"``, ``"approx"``, ``"hist"`` (default: ``"auto"``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> >>> airlines_xgb = H2OXGBoostEstimator(seed=1234, ... tree_method="approx") >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("tree_method") @tree_method.setter def tree_method(self, tree_method): assert_is_type(tree_method, None, Enum("auto", "exact", "approx", "hist")) self._parms["tree_method"] = tree_method @property def grow_policy(self): """ Grow policy - depthwise is standard GBM, lossguide is LightGBM One of: ``"depthwise"``, ``"lossguide"`` (default: ``"depthwise"``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> titanic["const_1"] = 6 >>> titanic["const_2"] = 7 >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234, ... grow_policy="depthwise") >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("grow_policy") @grow_policy.setter def grow_policy(self, grow_policy): assert_is_type(grow_policy, None, Enum("depthwise", "lossguide")) self._parms["grow_policy"] = grow_policy @property def booster(self): """ Booster type One of: ``"gbtree"``, ``"gblinear"``, ``"dart"`` (default: ``"gbtree"``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(booster='dart', ... normalize_type="tree", ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("booster") @booster.setter def booster(self, booster): assert_is_type(booster, None, Enum("gbtree", "gblinear", "dart")) self._parms["booster"] = booster @property def reg_lambda(self): """ L2 regularization Type: ``float`` (default: ``1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8]) >>> airlines_xgb = H2OXGBoostEstimator(reg_lambda=.0001, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("reg_lambda") @reg_lambda.setter def reg_lambda(self, reg_lambda): assert_is_type(reg_lambda, None, float) self._parms["reg_lambda"] = reg_lambda @property def reg_alpha(self): """ L1 regularization Type: ``float`` (default: ``0``). :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(reg_alpha=.25) >>> boston_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(boston_xgb.mse(valid=True)) """ return self._parms.get("reg_alpha") @reg_alpha.setter def reg_alpha(self, reg_alpha): assert_is_type(reg_alpha, None, float) self._parms["reg_alpha"] = reg_alpha @property def dmatrix_type(self): """ Type of DMatrix. For sparse, NAs and 0 are treated equally. One of: ``"auto"``, ``"dense"``, ``"sparse"`` (default: ``"auto"``). :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(dmatrix_type="auto", ... seed=1234) >>> boston_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> boston_xgb.mse() """ return self._parms.get("dmatrix_type") @dmatrix_type.setter def dmatrix_type(self, dmatrix_type): assert_is_type(dmatrix_type, None, Enum("auto", "dense", "sparse")) self._parms["dmatrix_type"] = dmatrix_type @property def backend(self): """ Backend. By default (auto), a GPU is used if available. One of: ``"auto"``, ``"gpu"``, ``"cpu"`` (default: ``"auto"``). :examples: >>> pros = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") >>> pros["CAPSULE"] = pros["CAPSULE"].asfactor() >>> pros_xgb = H2OXGBoostEstimator(tree_method="exact", ... seed=123, ... backend="cpu") >>> pros_xgb.train(y="CAPSULE", ... ignored_columns=["ID"], ... training_frame=pros) >>> pros_xgb.auc() """ return self._parms.get("backend") @backend.setter def backend(self, backend): assert_is_type(backend, None, Enum("auto", "gpu", "cpu")) self._parms["backend"] = backend @property def gpu_id(self): """ Which GPU to use. Type: ``int`` (default: ``0``). :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(gpu_id=0, ... seed=1234) >>> boston_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> boston_xgb.mse() """ return self._parms.get("gpu_id") @gpu_id.setter def gpu_id(self, gpu_id): assert_is_type(gpu_id, None, int) self._parms["gpu_id"] = gpu_id @property def gainslift_bins(self): """ Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning. Type: ``int`` (default: ``-1``). :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv") >>> model = H2OXGBoostEstimator(ntrees=1, gainslift_bins=20) >>> model.train(x=["Origin", "Distance"], ... y="IsDepDelayed", ... training_frame=airlines) >>> model.gains_lift() """ return self._parms.get("gainslift_bins") @gainslift_bins.setter def gainslift_bins(self, gainslift_bins): assert_is_type(gainslift_bins, None, int) self._parms["gainslift_bins"] = gainslift_bins
[docs] @staticmethod def available(): """ Ask the H2O server whether a XGBoost model can be built (depends on availability of native backends). :return: True if a XGBoost model can be built, or False otherwise. :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(seed=1234) >>> boston_xgb.available() """ if "XGBoost" not in h2o.cluster().list_core_extensions(): print("Cannot build an XGBoost model - no backend found.") return False else: return True