Modeling In H2O

Supervised

H2OCoxProportionalHazardsEstimator

class h2o.estimators.coxph.H2OCoxProportionalHazardsEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Cox Proportional Hazards

Trains a Cox Proportional Hazards Model (CoxPH) on an H2O dataset.

property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> checkpoints_dir = tempfile.mkdtemp()
>>> coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                            stop_column="stop",
...                                            export_checkpoints_dir=checkpoints_dir)
>>> coxph.train(x=predictor,
...             y=response,
...             training_frame=heart)
>>> len(listdir(checkpoints_dir))
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property init

Coefficient starting value.

Type: float (default: 0).

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  init=2.9)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property interaction_pairs

A list of pairwise (first order) column interactions.

Type: List[tuple].

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> interaction_pairs = [("start","stop")]
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  interaction_pairs=interaction_pairs)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property interactions

A list of predictor column indices to interact. All pairwise combinations will be computed for the list.

Type: List[str].

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> interactions = ['start','stop']
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  interactions=interactions)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property interactions_only

A list of columns that should only be used to create interactions but should not themselves participate in model training.

Type: List[str].

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> interactions = ['start','stop']
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  interactions_only=interactions)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property lre_min

Minimum log-relative error.

Type: float (default: 9).

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  lre_min=5)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property max_iterations

Maximum number of iterations.

Type: int (default: 20).

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  max_iterations=50)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  offset_column="transplant")
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property response_column

Response variable column.

Type: str.

property single_node_mode

Run on a single node to reduce the effect of network overhead (for smaller datasets).

Type: bool (default: False).

property start_column

Start Time Column.

Type: str.

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> train, valid = heart.split_frame(ratios=[.8])
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop")
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> heart_coxph.scoring_history()
property stop_column

Stop Time Column.

Type: str.

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> train, valid = heart.split_frame(ratios=[.8])
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop")
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> heart_coxph.scoring_history()
property stratify_by

List of columns to use for stratification.

Type: List[str].

property ties

Method for Handling Ties.

One of: "efron", "breslow" (default: "efron").

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> train, valid = heart.split_frame(ratios=[.8])
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  ties="breslow")
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> heart_coxph.scoring_history()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> train, valid = heart.split_frame(ratios=[.8])
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop")
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> heart_coxph.scoring_history()
property use_all_factor_levels

(Internal. For development only!) Indicates whether to use all factor levels.

Type: bool (default: False).

Examples

>>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
>>> predictor = "age"
>>> response = "event"
>>> heart_coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
...                                                  stop_column="stop",
...                                                  use_all_factor_levels=True)
>>> heart_coxph.train(x=predictor,
...                   y=response,
...                   training_frame=heart)
>>> heart_coxph.scoring_history()
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

H2ODeepLearningEstimator

class h2o.estimators.deeplearning.H2ODeepLearningEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Deep Learning

Build a Deep Neural Network model using CPUs. Builds a feed-forward multilayer artificial neural network on an H2OFrame.

Examples

>>> from h2o.estimators.deeplearning import H2ODeepLearningEstimator
>>> rows = [[1,2,3,4,0], [2,1,2,4,1], [2,1,4,2,1],
...         [0,1,2,34,1], [2,3,4,1,0]] * 50
>>> fr = h2o.H2OFrame(rows)
>>> fr[4] = fr[4].asfactor()
>>> model = H2ODeepLearningEstimator()
>>> model.train(x=range(4), y=4, training_frame=fr)
>>> model.logloss()
property activation

Activation function.

One of: "tanh", "tanh_with_dropout", "rectifier", "rectifier_with_dropout", "maxout", "maxout_with_dropout" (default: "rectifier").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(activation="tanh")
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property adaptive_rate

Adaptive learning rate.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(adaptive_rate=True)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property autoencoder

Auto-Encoder.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(autoencoder=True)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property average_activation

Average activation for sparse auto-encoder. #Experimental

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(average_activation=1.5,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property balance_classes

Balance training data class counts via over/under-sampling (for imbalanced data).

Type: bool (default: False).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(balance_classes=True,
...                                   seed=1234)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.mse()
property categorical_encoding

Encoding scheme for categorical features.

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> encoding = "one_hot_internal"
>>> airlines_dl = H2ODeepLearningEstimator(categorical_encoding=encoding,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.mse()
property checkpoint

Model checkpoint to resume training with.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(activation="tanh",
...                                    autoencoder=True,
...                                    seed=1234,
...                                    model_id="cars_dl")
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
>>> cars_cont = H2ODeepLearningEstimator(checkpoint=cars_dl,
...                                      seed=1234)
>>> cars_cont.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> cars_cont.mse()
property class_sampling_factors

Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.

Type: List[float].

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cov_dl = H2ODeepLearningEstimator(balance_classes=True,
...                                   class_sampling_factors=sample_factors,
...                                   seed=1234)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.mse()
property classification_stop

Stopping criterion for classification error fraction on training data (-1 to disable).

Type: float (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(classification_stop=1.5,
...                                   seed=1234)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.mse()
property col_major

#DEPRECATED Use a column major weight matrix for input layer. Can speed up forward propagation, but might slow down backpropagation.

Type: bool (default: False).

property diagnostics

Enable diagnostics for hidden layers.

Type: bool (default: True).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(diagnostics=True,
...                                   seed=1234)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.mse()
property distribution

Distribution function.

One of: "auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(distribution="poisson",
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property elastic_averaging

Elastic averaging between compute nodes can improve distributed model convergence. #Experimental

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(elastic_averaging=True,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property elastic_averaging_moving_rate

Elastic averaging moving rate (only if elastic averaging is enabled).

Type: float (default: 0.9).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(elastic_averaging_moving_rate=.8,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property elastic_averaging_regularization

Elastic averaging regularization strength (only if elastic averaging is enabled).

Type: float (default: 0.001).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(elastic_averaging_regularization=.008,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property epochs

How many times the dataset should be iterated (streamed), can be fractional.

Type: float (default: 10).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(epochs=15,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property epsilon

Adaptive learning rate smoothing factor (to avoid divisions by zero and allow progress).

Type: float (default: 1e-08).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(epsilon=1e-6,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> checkpoints_dir = tempfile.mkdtemp()
>>> cars_dl = H2ODeepLearningEstimator(export_checkpoints_dir=checkpoints_dir,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> len(listdir(checkpoints_dir))
property export_weights_and_biases

Whether to export Neural Network weights and biases to H2O Frames.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(export_weights_and_biases=True,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property fast_mode

Enable fast mode (minor approximation in back-propagation).

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(fast_mode=False,
...                                    seed=1234)          
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(fold_assignment="Random",
...                                    nfolds=5,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> print(cars['fold_numbers'])
>>> cars_dl = H2ODeepLearningEstimator(seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars,
...               fold_column="fold_numbers")
>>> cars_dl.mse()
property force_load_balance

Force extra load balancing to increase training speed for small datasets (to keep all cores busy).

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(force_load_balance=False,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property hidden

Hidden layer sizes (e.g. [100, 100]).

Type: List[int] (default: [200, 200]).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(hidden=[100,100],
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.mse()
property hidden_dropout_ratios

Hidden layer dropout ratios (can improve generalization), specify one value per hidden layer, defaults to 0.5.

Type: List[float].

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> valid = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> features = list(range(0,784))
>>> target = 784
>>> train[target] = train[target].asfactor()
>>> valid[target] = valid[target].asfactor()
>>> model = H2ODeepLearningEstimator(epochs=20,
...                                  hidden=[200,200],
...                                  hidden_dropout_ratios=[0.5,0.5],
...                                  seed=1234,
...                                  activation='tanh_with_dropout')
>>> model.train(x=features,
...             y=target,
...             training_frame=train,
...             validation_frame=valid)
>>> model.mse()
property huber_alpha

Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and 1).

Type: float (default: 0.9).

Examples

>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_dl = H2ODeepLearningEstimator(distribution="huber",
...                                         huber_alpha=0.9,
...                                         seed=1234)
>>> insurance_dl.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> insurance_dl.mse()
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(seed=1234,
...                                    ignore_const_cols=True)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property initial_biases

A list of H2OFrame ids to initialize the bias vectors of this model with.

Type: List[H2OFrame].

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> dl1 = H2ODeepLearningEstimator(hidden=[10,10],
...                                export_weights_and_biases=True)
>>> dl1.train(x=list(range(4)), y=4, training_frame=iris)
>>> p1 = dl1.model_performance(iris).logloss()
>>> ll1 = dl1.predict(iris)
>>> print(p1)
>>> w1 = dl1.weights(0)
>>> w2 = dl1.weights(1)
>>> w3 = dl1.weights(2)
>>> b1 = dl1.biases(0)
>>> b2 = dl1.biases(1)
>>> b3 = dl1.biases(2)
>>> dl2 = H2ODeepLearningEstimator(hidden=[10,10],
...                                initial_weights=[w1, w2, w3],
...                                initial_biases=[b1, b2, b3],
...                                epochs=0)
>>> dl2.train(x=list(range(4)), y=4, training_frame=iris)
>>> dl2.initial_biases
property initial_weight_distribution

Initial weight distribution.

One of: "uniform_adaptive", "uniform", "normal" (default: "uniform_adaptive").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(initial_weight_distribution="Uniform",
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property initial_weight_scale

Uniform: -value…value, Normal: stddev.

Type: float (default: 1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(initial_weight_scale=1.5,
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property initial_weights

A list of H2OFrame ids to initialize the weight matrices of this model with.

Type: List[H2OFrame].

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> dl1 = H2ODeepLearningEstimator(hidden=[10,10],
...                                export_weights_and_biases=True)
>>> dl1.train(x=list(range(4)), y=4, training_frame=iris)
>>> p1 = dl1.model_performance(iris).logloss()
>>> ll1 = dl1.predict(iris)
>>> print(p1)
>>> w1 = dl1.weights(0)
>>> w2 = dl1.weights(1)
>>> w3 = dl1.weights(2)
>>> b1 = dl1.biases(0)
>>> b2 = dl1.biases(1)
>>> b3 = dl1.biases(2)
>>> dl2 = H2ODeepLearningEstimator(hidden=[10,10],
...                                initial_weights=[w1, w2, w3],
...                                initial_biases=[b1, b2, b3],
...                                epochs=0)
>>> dl2.train(x=list(range(4)), y=4, training_frame=iris)
>>> dl2.initial_weights
property input_dropout_ratio

Input layer dropout ratio (can improve generalization, try 0.1 or 0.2).

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(input_dropout_ratio=0.2,
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(keep_cross_validation_fold_assignment=True,
...                                    nfolds=5,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print(cars_dl.cross_validation_fold_assignment())
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(keep_cross_validation_models=True,
...                                    nfolds=5,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print(cars_dl.cross_validation_models())
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(keep_cross_validation_predictions=True,
...                                    nfolds=5,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train)
>>> print(cars_dl.cross_validation_predictions())
property l1

L1 regularization (can add stability and improve generalization, causes many weights to become 0).

Type: float (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> hh_imbalanced = H2ODeepLearningEstimator(l1=1e-5,
...                                          activation="Rectifier",
...                                          loss="CrossEntropy",
...                                          hidden=[200,200],
...                                          epochs=1,
...                                          balance_classes=False,
...                                          reproducible=True,
...                                          seed=1234)
>>> hh_imbalanced.train(x=list(range(54)),y=54, training_frame=covtype)
>>> hh_imbalanced.mse()
property l2

L2 regularization (can add stability and improve generalization, causes many weights to be small).

Type: float (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> hh_imbalanced = H2ODeepLearningEstimator(l2=1e-5,
...                                          activation="Rectifier",
...                                          loss="CrossEntropy",
...                                          hidden=[200,200],
...                                          epochs=1,
...                                          balance_classes=False,
...                                          reproducible=True,
...                                          seed=1234)
>>> hh_imbalanced.train(x=list(range(54)),y=54, training_frame=covtype)
>>> hh_imbalanced.mse()
property loss

Loss function.

One of: "automatic", "cross_entropy", "quadratic", "huber", "absolute", "quantile" (default: "automatic").

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> hh_imbalanced = H2ODeepLearningEstimator(l1=1e-5,
...                                          activation="Rectifier",
...                                          loss="CrossEntropy",
...                                          hidden=[200,200],
...                                          epochs=1,
...                                          balance_classes=False,
...                                          reproducible=True,
...                                          seed=1234)
>>> hh_imbalanced.train(x=list(range(54)),y=54, training_frame=covtype)
>>> hh_imbalanced.mse()
property max_after_balance_size

Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes.

Type: float (default: 5).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> max_size = .85
>>> cov_dl = H2ODeepLearningEstimator(balance_classes=True,
...                                   max_after_balance_size=max_size,
...                                   seed=1234)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.logloss()
property max_categorical_features

Max. number of categorical features, enforced via hashing. #Experimental

Type: int (default: 2147483647).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(balance_classes=True,
...                                   max_categorical_features=2147483647,
...                                   seed=1234)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.logloss()
property max_confusion_matrix_size

[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs.

Type: int (default: 20).

property max_hit_ratio_k

[Deprecated] Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable).

Type: int (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(max_hit_ratio_k=3,
...                                   seed=1234) 
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.show()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(max_runtime_secs=10,
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property max_w2

Constraint for squared sum of incoming weights per unit (e.g. for Rectifier).

Type: float (default: 3.4028235e+38).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(activation="RectifierWithDropout",
...                                   hidden=[10,10],
...                                   epochs=10,
...                                   input_dropout_ratio=0.2,
...                                   l1=1e-5,
...                                   max_w2=10.5,
...                                   stopping_rounds=0)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.mse()
property mini_batch_size

Mini-batch size (smaller leads to better fit, larger can speed up and generalize better).

Type: int (default: 1).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_dl = H2ODeepLearningEstimator(activation="RectifierWithDropout",
...                                   hidden=[10,10],
...                                   epochs=10,
...                                   input_dropout_ratio=0.2,
...                                   l1=1e-5,
...                                   max_w2=10.5,
...                                   stopping_rounds=0,
...                                   mini_batch_size=35)
>>> cov_dl.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_dl.mse()
property missing_values_handling

Handling of missing values. Either MeanImputation or Skip.

One of: "mean_imputation", "skip" (default: "mean_imputation").

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston.insert_missing_values()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_dl = H2ODeepLearningEstimator(missing_values_handling="skip")
>>> boston_dl.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> boston_dl.mse()
property momentum_ramp

Number of training samples for which momentum increases.

Type: float (default: 1000000).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Year","Month","DayofMonth","DayOfWeek","CRSDepTime",
...               "CRSArrTime","UniqueCarrier","FlightNum"]
>>> response_col = "IsDepDelayed"
>>> airlines_dl = H2ODeepLearningEstimator(hidden=[200,200],
...                                        activation="Rectifier",
...                                        input_dropout_ratio=0.0,
...                                        momentum_start=0.9,
...                                        momentum_stable=0.99,
...                                        momentum_ramp=1e7,
...                                        epochs=100,
...                                        stopping_rounds=4,
...                                        train_samples_per_iteration=30000,
...                                        mini_batch_size=32,
...                                        score_duty_cycle=0.25,
...                                        score_interval=1)
>>> airlines_dl.train(x=predictors,
...                   y=response_col,
...                   training_frame=airlines)
>>> airlines_dl.mse()
property momentum_stable

Final momentum after the ramp is over (try 0.99).

Type: float (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Year","Month","DayofMonth","DayOfWeek","CRSDepTime",
...               "CRSArrTime","UniqueCarrier","FlightNum"]
>>> response_col = "IsDepDelayed"
>>> airlines_dl = H2ODeepLearningEstimator(hidden=[200,200],
...                                        activation="Rectifier",
...                                        input_dropout_ratio=0.0,
...                                        momentum_start=0.9,
...                                        momentum_stable=0.99,
...                                        momentum_ramp=1e7,
...                                        epochs=100,
...                                        stopping_rounds=4,
...                                        train_samples_per_iteration=30000,
...                                        mini_batch_size=32,
...                                        score_duty_cycle=0.25,
...                                        score_interval=1)
>>> airlines_dl.train(x=predictors,
...                   y=response_col,
...                   training_frame=airlines)
>>> airlines_dl.mse()
property momentum_start

Initial momentum at the beginning of training (try 0.5).

Type: float (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Year","Month","DayofMonth","DayOfWeek","CRSDepTime",
...               "CRSArrTime","UniqueCarrier","FlightNum"]
>>> response_col = "IsDepDelayed"
>>> airlines_dl = H2ODeepLearningEstimator(hidden=[200,200],
...                                        activation="Rectifier",
...                                        input_dropout_ratio=0.0,
...                                        momentum_start=0.9,
...                                        momentum_stable=0.99,
...                                        momentum_ramp=1e7,
...                                        epochs=100,
...                                        stopping_rounds=4,
...                                        train_samples_per_iteration=30000,
...                                        mini_batch_size=32,
...                                        score_duty_cycle=0.25,
...                                        score_interval=1)
>>> airlines_dl.train(x=predictors,
...                   y=response_col,
...                   training_frame=airlines)
>>> airlines_dl.mse()
property nesterov_accelerated_gradient

Use Nesterov accelerated gradient (recommended).

Type: bool (default: True).

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> predictors = list(range(0,784))
>>> resp = 784
>>> train[resp] = train[resp].asfactor()
>>> test[resp] = test[resp].asfactor()
>>> nclasses = train[resp].nlevels()[0]
>>> model = H2ODeepLearningEstimator(activation="RectifierWithDropout",
...                                  adaptive_rate=False,
...                                  rate=0.01,
...                                  rate_decay=0.9,
...                                  rate_annealing=1e-6,
...                                  momentum_start=0.95,
...                                  momentum_ramp=1e5,
...                                  momentum_stable=0.99,
...                                  nesterov_accelerated_gradient=False,
...                                  input_dropout_ratio=0.2,
...                                  train_samples_per_iteration=20000,
...                                  classification_stop=-1,
...                                  l1=1e-5) 
>>> model.train (x=predictors,
...              y=resp,
...              training_frame=train,
...              validation_frame=test)
>>> model.model_performance()
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(nfolds=5, seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston["offset"] = boston["medv"].log()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_dl = H2ODeepLearningEstimator(offset_column="offset",
...                                      seed=1234)
>>> boston_dl.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> boston_dl.mse()
property overwrite_with_best_model

If enabled, override the final model with the best model found during training.

Type: bool (default: True).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston["offset"] = boston["medv"].log()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_dl = H2ODeepLearningEstimator(overwrite_with_best_model=True,
...                                      seed=1234)
>>> boston_dl.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> boston_dl.mse()
property pretrained_autoencoder

Pretrained autoencoder model to initialize this model with.

Type: str.

Examples

>>> from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
>>> resp = 784
>>> nfeatures = 20
>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> train[resp] = train[resp].asfactor()
>>> test[resp] = test[resp].asfactor()
>>> sid = train[0].runif(0)
>>> train_unsupervised = train[sid>=0.5]
>>> train_unsupervised.pop(resp)
>>> train_supervised = train[sid<0.5]
>>> ae_model = H2OAutoEncoderEstimator(activation="Tanh",
...                                    hidden=[nfeatures],
...                                    model_id="ae_model",
...                                    epochs=1,
...                                    ignore_const_cols=False,
...                                    reproducible=True,
...                                    seed=1234)
>>> ae_model.train(list(range(resp)), training_frame=train_unsupervised)
>>> ae_model.mse()
>>> pretrained_model = H2ODeepLearningEstimator(activation="Tanh",
...                                             hidden=[nfeatures],
...                                             epochs=1,
...                                             reproducible = True,
...                                             seed=1234,
...                                             ignore_const_cols=False,
...                                             pretrained_autoencoder="ae_model")
>>> pretrained_model.train(list(range(resp)), resp,
...                        training_frame=train_supervised,
...                        validation_frame=test)
>>> pretrained_model.mse()
property quantile_alpha

Desired quantile for Quantile regression, must be between 0 and 1.

Type: float (default: 0.5).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_dl = H2ODeepLearningEstimator(distribution="quantile",
...                                      quantile_alpha=.8,
...                                      seed=1234)
>>> boston_dl.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> boston_dl.mse()
property quiet_mode

Enable quiet mode for less output to standard output.

Type: bool (default: False).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_dl = H2ODeepLearningEstimator(quiet_mode=True,
...                                       seed=1234)
>>> titanic_dl.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> titanic_dl.mse()
property rate

Learning rate (higher => less stable, lower => slower convergence).

Type: float (default: 0.005).

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> predictors = list(range(0,784))
>>> resp = 784
>>> train[resp] = train[resp].asfactor()
>>> test[resp] = test[resp].asfactor()
>>> nclasses = train[resp].nlevels()[0]
>>> model = H2ODeepLearningEstimator(activation="RectifierWithDropout",
...                                  adaptive_rate=False,
...                                  rate=0.01,
...                                  rate_decay=0.9,
...                                  rate_annealing=1e-6,
...                                  momentum_start=0.95,
...                                  momentum_ramp=1e5,
...                                  momentum_stable=0.99,
...                                  nesterov_accelerated_gradient=False,
...                                  input_dropout_ratio=0.2,
...                                  train_samples_per_iteration=20000,
...                                  classification_stop=-1,
...                                  l1=1e-5)
>>> model.train (x=predictors,y=resp, training_frame=train, validation_frame=test)
>>> model.model_performance(valid=True)
property rate_annealing

Learning rate annealing: rate / (1 + rate_annealing * samples).

Type: float (default: 1e-06).

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> predictors = list(range(0,784))
>>> resp = 784
>>> train[resp] = train[resp].asfactor()
>>> test[resp] = test[resp].asfactor()
>>> nclasses = train[resp].nlevels()[0]
>>> model = H2ODeepLearningEstimator(activation="RectifierWithDropout",
...                                  adaptive_rate=False,
...                                  rate=0.01,
...                                  rate_decay=0.9,
...                                  rate_annealing=1e-6,
...                                  momentum_start=0.95,
...                                  momentum_ramp=1e5,
...                                  momentum_stable=0.99,
...                                  nesterov_accelerated_gradient=False,
...                                  input_dropout_ratio=0.2,
...                                  train_samples_per_iteration=20000,
...                                  classification_stop=-1,
...                                  l1=1e-5)
>>> model.train (x=predictors,
...              y=resp,
...              training_frame=train,
...              validation_frame=test)
>>> model.mse()
property rate_decay

Learning rate decay factor between layers (N-th layer: rate * rate_decay ^ (N - 1)).

Type: float (default: 1).

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> predictors = list(range(0,784))
>>> resp = 784
>>> train[resp] = train[resp].asfactor()
>>> test[resp] = test[resp].asfactor()
>>> nclasses = train[resp].nlevels()[0]
>>> model = H2ODeepLearningEstimator(activation="RectifierWithDropout",
...                                  adaptive_rate=False,
...                                  rate=0.01,
...                                  rate_decay=0.9,
...                                  rate_annealing=1e-6,
...                                  momentum_start=0.95,
...                                  momentum_ramp=1e5,
...                                  momentum_stable=0.99,
...                                  nesterov_accelerated_gradient=False,
...                                  input_dropout_ratio=0.2,
...                                  train_samples_per_iteration=20000,
...                                  classification_stop=-1,
...                                  l1=1e-5)
>>> model.train (x=predictors,
...              y=resp,
...              training_frame=train,
...              validation_frame=test)
>>> model.model_performance()
property regression_stop

Stopping criterion for regression error (MSE) on training data (-1 to disable).

Type: float (default: 1e-06).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(regression_stop=1e-6,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property replicate_training_data

Replicate the entire training dataset onto every node for faster training on small datasets.

Type: bool (default: True).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> airlines_dl = H2ODeepLearningEstimator(replicate_training_data=False)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=airlines) 
>>> airlines_dl.auc()
property reproducible

Force reproducibility on small data (will be slow - only uses 1 thread).

Type: bool (default: False).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(reproducible=True)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property response_column

Response variable column.

Type: str.

property rho

Adaptive learning rate time decay factor (similarity to prior updates).

Type: float (default: 0.99).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(rho=0.9,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property score_duty_cycle

Maximum duty cycle fraction for scoring (lower: more training, higher: more scoring).

Type: float (default: 0.1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(score_duty_cycle=0.2,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(score_each_iteration=True,
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property score_interval

Shortest time interval (in seconds) between model scoring.

Type: float (default: 5).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(score_interval=3,
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property score_training_samples

Number of training set samples for scoring (0 for all).

Type: int (default: 10000).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(score_training_samples=10000,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property score_validation_samples

Number of validation set samples for scoring (0 for all).

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(score_validation_samples=3,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property score_validation_sampling

Method used to sample validation dataset for scoring.

One of: "uniform", "stratified" (default: "uniform").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(score_validation_sampling="uniform",
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property seed

Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded.

Type: int (default: -1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property shuffle_training_data

Enable shuffling of training data (recommended if training data is replicated and train_samples_per_iteration is close to #nodes x #rows, or if using balance_classes).

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(shuffle_training_data=True,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property single_node_mode

Run on a single node for fine-tuning of model parameters.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(single_node_mode=True,
...                                    seed=1234) 
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property sparse

Sparse data handling (more efficient for data with lots of 0 values).

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(sparse=True,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property sparsity_beta

Sparsity regularization. #Experimental

Type: float (default: 0).

Examples

>>> from h2o.estimators import H2OAutoEncoderEstimator
>>> resp = 784
>>> nfeatures = 20
>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> train[resp] = train[resp].asfactor()
>>> test[resp] = test[resp].asfactor()
>>> sid = train[0].runif(0)
>>> train_unsupervised = train[sid>=0.5]
>>> train_unsupervised.pop(resp)
>>> ae_model = H2OAutoEncoderEstimator(activation="Tanh",
...                                    hidden=[nfeatures],
...                                    epochs=1,
...                                    ignore_const_cols=False,
...                                    reproducible=True,
...                                    sparsity_beta=0.5,
...                                    seed=1234)
>>> ae_model.train(list(range(resp)),
...                training_frame=train_unsupervised)
>>> ae_model.mse()
property standardize

If enabled, automatically standardize the data. If disabled, the user must provide properly scaled input data.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_dl = H2ODeepLearningEstimator(standardize=True,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_dl.auc()
property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(stopping_metric="auc",
...                                        stopping_rounds=3,
...                                        stopping_tolerance=1e-2,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 5).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(stopping_metric="auc",
...                                        stopping_rounds=3,
...                                        stopping_tolerance=1e-2,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(stopping_metric="auc",
...                                        stopping_rounds=3,
...                                        stopping_tolerance=1e-2,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property target_ratio_comm_to_comp

Target ratio of communication overhead to computation. Only for multi-node operation and train_samples_per_iteration = -2 (auto-tuning).

Type: float (default: 0.05).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(target_ratio_comm_to_comp=0.05,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property train_samples_per_iteration

Number of training samples (globally) per MapReduce iteration. Special values are 0: one epoch, -1: all available data (e.g., replicated training data), -2: automatic.

Type: int (default: -2).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(train_samples_per_iteration=-1,
...                                        epochs=1,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator()
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property tweedie_power

Tweedie power for Tweedie regression, must be between 1 and 2.

Type: float (default: 1.5).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(tweedie_power=1.5,
...                                        seed=1234) 
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.auc()
property use_all_factor_levels

Use all factor levels of categorical variables. Otherwise, the first factor level is omitted (without loss of accuracy). Useful for variable importances and auto-enabled for autoencoder.

Type: bool (default: True).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(use_all_factor_levels=True,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.mse()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(standardize=True,
...                                    seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()
property variable_importances

Compute variable importances for input features (Gedeon method) - can be slow for large networks.

Type: bool (default: True).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_dl = H2ODeepLearningEstimator(variable_importances=True,
...                                        seed=1234)
>>> airlines_dl.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> airlines_dl.mse()
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_dl = H2ODeepLearningEstimator(seed=1234)
>>> cars_dl.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_dl.auc()

H2OGeneralizedAdditiveEstimator

class h2o.estimators.gam.H2OGeneralizedAdditiveEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Generalized Additive Model

Fits a generalized additive model, specified by a response variable, a set of predictors, and a description of the error distribution.

A subclass of ModelBase is returned. The specific subclass depends on the machine learning task at hand (if it’s binomial classification, then an H2OBinomialModel is returned, if it’s regression then a H2ORegressionModel is returned). The default print-out of the models is shown, but further GAM-specific information can be queried out of the object. Upon completion of the GAM, the resulting object has coefficients, normalized coefficients, residual/null deviance, aic, and a host of model metrics including MSE, AUC (for logistic regression), degrees of freedom, and confusion matrices.

property Lambda

DEPRECATED. Use self.lambda_ instead

property alpha

Distribution of regularization between the L1 (Lasso) and L2 (Ridge) penalties. A value of 1 for alpha represents Lasso regression, a value of 0 produces Ridge regression, and anything in between specifies the amount of mixing between the two. Default value of alpha is 0 when SOLVER = ‘L-BFGS’; 0.5 otherwise.

Type: List[float].

property balance_classes

Balance training data class counts via over/under-sampling (for imbalanced data).

Type: bool (default: False).

property beta_constraints

Beta constraints

Type: H2OFrame.

property beta_epsilon

Converge if beta changes less (using L-infinity norm) than beta epsilon, ONLY applies to IRLSM solver

Type: float (default: 0.0001).

property bs

Basis function type for each gam predictor, 0 for cr

Type: List[int].

property class_sampling_factors

Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.

Type: List[float].

property cold_start

Only applicable to multiple alpha/lambda values when calling GLM from GAM. If false, build the next model for next set of alpha/lambda values starting from the values provided by current model. If true will start GLM model from scratch.

Type: bool (default: False).

property compute_p_values

Request p-values computation, p-values work only with IRLSM solver and no regularization

Type: bool (default: False).

property custom_metric_func

Reference to custom evaluation function, format: language:keyName=funcName

Type: str.

property early_stopping

Stop early when there is no more relative improvement on train or validation (if provided)

Type: bool (default: True).

property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

property family

Family. Use binomial for classification with logistic regression, others are for regression problems.

One of: "auto", "gaussian", "binomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial", "fractionalbinomial" (default: "auto").

property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

property gam_columns

Predictor column names for gam

Type: List[str].

property gradient_epsilon

Converge if objective changes less (using L-infinity norm) than this, ONLY applies to L-BFGS solver. Default indicates: If lambda_search is set to False and lambda is equal to zero, the default value of gradient_epsilon is equal to .000001, otherwise the default value is .0001. If lambda_search is set to True, the conditional values above are 1E-8 and 1E-6 respectively.

Type: float (default: -1).

property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property interaction_pairs

A list of pairwise (first order) column interactions.

Type: List[tuple].

property interactions

A list of predictor column indices to interact. All pairwise combinations will be computed for the list.

Type: List[str].

property intercept

Include constant term in the model

Type: bool (default: True).

property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

property keep_gam_cols

Save keys of model matrix

Type: bool (default: False).

property knot_ids

String arrays storing frame keys of knots. One for each gam column specified in gam_columns

Type: List[str].

property lambda_

Regularization strength

Type: List[float].

property lambda_min_ratio

Minimum lambda used in lambda search, specified as a ratio of lambda_max (the smallest lambda that drives all coefficients to zero). Default indicates: if the number of observations is greater than the number of variables, then lambda_min_ratio is set to 0.0001; if the number of observations is less than the number of variables, then lambda_min_ratio is set to 0.01.

Type: float (default: -1).

property lambda_search

Use lambda search starting at lambda max, given lambda is then interpreted as lambda min

Type: bool (default: False).

property link

Link function.

One of: "family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit" (default: "family_default").

property max_active_predictors

Maximum number of active predictors during computation. Use as a stopping criterion to prevent expensive model building with many predictors. Default indicates: If the IRLSM solver is used, the value of max_active_predictors is set to 5000 otherwise it is set to 100000000.

Type: int (default: -1).

property max_after_balance_size

Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes.

Type: float (default: 5).

property max_confusion_matrix_size

[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs

Type: int (default: 20).

property max_hit_ratio_k

[Deprecated] Maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)

Type: int (default: 0).

property max_iterations

Maximum number of iterations

Type: int (default: -1).

property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

property missing_values_handling

Handling of missing values. Either MeanImputation, Skip or PlugValues.

One of: "mean_imputation", "skip", "plug_values" (default: "mean_imputation").

property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

property nlambdas

Number of lambdas to be used in a search. Default indicates: If alpha is zero, with lambda search set to True, the value of nlambdas is set to 30 (fewer lambdas are needed for ridge regression) otherwise it is set to 100.

Type: int (default: -1).

property non_negative

Restrict coefficients (not intercept) to be non-negative

Type: bool (default: False).

property num_knots

Number of knots for gam predictors

Type: List[int].

property obj_reg

Likelihood divider in objective value computation, default is 1/nobs

Type: float (default: -1).

property objective_epsilon

Converge if objective value changes less than this. Default indicates: If lambda_search is set to True the value of objective_epsilon is set to .0001. If the lambda_search is set to False and lambda is equal to zero, the value of objective_epsilon is set to .000001, for any other value of lambda the default value of objective_epsilon is set to .0001.

Type: float (default: -1).

property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

property plug_values

Plug Values (a single row frame containing values that will be used to impute missing values of the training/validation frame, use in conjunction with missing_values_handling = PlugValues)

Type: H2OFrame.

property prior

Prior probability for y==1. To be used only for logistic regression iff the data has been sampled and the mean of response does not reflect reality.

Type: float (default: -1).

property remove_collinear_columns

In case of linearly dependent columns, remove some of the dependent columns

Type: bool (default: False).

property response_column

Response variable column.

Type: str.

property scale

Smoothing parameter for gam predictors

Type: List[float].

property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

property solver

AUTO will set the solver based on given data and the other parameters. IRLSM is fast on problems with small number of predictors and for lambda-search with L1 penalty, L_BFGS scales better for datasets with many columns.

One of: "auto", "irlsm", "l_bfgs", "coordinate_descent_naive", "coordinate_descent", "gradient_descent_lh", "gradient_descent_sqerr" (default: "auto").

property standardize

Standardize numeric columns to have zero mean and unit variance

Type: bool (default: False).

property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing" (default: "auto").

property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 0).

property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0.001).

property theta

Theta

Type: float (default: 0).

property training_frame

Id of the training data frame.

Type: H2OFrame.

property tweedie_link_power

Tweedie link power

Type: float (default: 0).

property tweedie_variance_power

Tweedie variance power

Type: float (default: 0).

property validation_frame

Id of the validation data frame.

Type: H2OFrame.

property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

H2OGradientBoostingEstimator

class h2o.estimators.gbm.H2OGradientBoostingEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Gradient Boosting Machine

Builds gradient boosted trees on a parsed data set, for regression or classification. The default distribution function will guess the model type based on the response column type. Otherwise, the response column must be an enum for “bernoulli” or “multinomial”, and numeric for all other distributions.

property balance_classes

Balance training data class counts via over/under-sampling (for imbalanced data).

Type: bool (default: False).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
...                                        seed=1234)
>>> cov_gbm.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
property build_tree_one_node

Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(build_tree_one_node=True,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property calibrate_model

Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities.

Type: bool (default: False).

Examples

>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> train, calib = ecology.split_frame(seed = 12354)
>>> predictors = ecology.columns[3:13]
>>> w = h2o.create_frame(binary_fraction=1,
...                      binary_ones_fraction=0.5,
...                      missing_fraction=0,
...                      rows=744, cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
...                                            max_depth=5,
...                                            min_rows=10,
...                                            learn_rate=0.1,
...                                            distribution="multinomial",
...                                            weights_column="weight",
...                                            calibrate_model=True,
...                                            calibration_frame=calib)
>>> ecology_gbm.train(x=predictors,
...                   y="Angaus",
...                   training_frame=train)
>>> ecology_gbm.auc()
property calibration_frame

Calibration frame for Platt Scaling

Type: H2OFrame.

Examples

>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> predictors = ecology.columns[3:13]
>>> train, calib = ecology.split_frame(seed=12354)
>>> w = h2o.create_frame(binary_fraction=1,
...                      binary_ones_fraction=0.5,
...                      missing_fraction=0,
...                      rows=744,cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10,
...                                            max_depth=5,
...                                            min_rows=10,
...                                            learn_rate=0.1,
...                                            distribution="multinomial",
...                                            calibrate_model=True,
...                                            calibration_frame=calib)
>>> ecology_gbm.train(x=predictors,
...                   y="Angaus",
...                   training_frame=train,
...                   weights_column="weight")
>>> ecology_gbm.auc()
property categorical_encoding

Encoding scheme for categorical features

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(categorical_encoding="labelencoder",
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property check_constant_response

Check if response column is constant. If enabled, then an exception is thrown if the response column is a constant value. If disabled, then model will train regardless of the response column being a constant value or not.

Type: bool (default: True).

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
>>> train["constantCol"] = 1
>>> my_gbm = H2OGradientBoostingEstimator(check_constant_response=False)
>>> my_gbm.train(x=list(range(1,5)),
...              y="constantCol",
...              training_frame=train)
property checkpoint

Model checkpoint to resume training with.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=1,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print(cars_gbm.auc(valid=True))
>>> print("Number of trees built for cars_gbm model:", cars_gbm.ntrees)
>>> cars_gbm_continued = H2OGradientBoostingEstimator(checkpoint=cars_gbm.model_id,
...                                                   ntrees=50,
...                                                   seed=1234)
>>> cars_gbm_continued.train(x=predictors,
...                          y=response,
...                          training_frame=train,
...                          validation_frame=valid)
>>> cars_gbm_continued.auc(valid=True)
>>> print("Number of trees built for cars_gbm model:",cars_gbm_continued.ntrees) 
property class_sampling_factors

Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.

Type: List[float].

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
...                                        class_sampling_factors=sample_factors,
...                                        seed=1234)
>>> cov_gbm.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
property col_sample_rate

Column sample rate (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate=.7,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property col_sample_rate_change_per_level

Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_change_per_level=.9,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property col_sample_rate_per_tree

Column sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_per_tree=.7,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property custom_distribution_func

Reference to custom distribution, format: language:keyName=funcName

Type: str.

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(ntrees=3,
...                                             max_depth=5,
...                                             distribution="bernoulli",
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> from h2o.utils.distributions import CustomDistributionBernoulli
>>> custom_distribution_bernoulli = h2o.upload_custom_distribution(CustomDistributionBernoulli,
...                                                                func_name="custom_bernoulli",
...                                                                func_file="custom_bernoulli.py")
>>> airlines_gbm_custom = H2OGradientBoostingEstimator(ntrees=3,
...                                                    max_depth=5,
...                                                    distribution="custom",
...                                                    custom_distribution_func=custom_distribution_bernoulli,
...                                                    seed=1235)
>>> airlines_gbm_custom.train(x=predictors,
...                           y=response,
...                           training_frame=train,
...                           validation_frame=valid)
>>> airlines_gbm.auc()
property custom_metric_func

Reference to custom evaluation function, format: language:keyName=funcName

Type: str.

property distribution

Distribution function

One of: "auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(distribution="poisson",
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.mse(valid=True)
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> hyper_parameters = {'ntrees': [5,10]}
>>> search_crit = {'strategy': "RandomDiscrete",
...                'max_models': 5,
...                'seed': 1234,
...                'stopping_rounds': 3,
...                'stopping_metric': "AUTO",
...                'stopping_tolerance': 1e-2}
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
...                          hyper_params=hyper_parameters,
...                          search_criteria=search_crit)
>>> air_grid.train(x=predictors,
...                y=response,
...                training_frame=airlines,
...                distribution="bernoulli",
...                learn_rate=0.1,
...                max_depth=3,
...                export_checkpoints_dir=checkpoints_dir)
>>> len(listdir(checkpoints_dir))
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> assignment_type = "Random"
>>> cars_gbm = H2OGradientBoostingEstimator(fold_assignment=assignment_type,
...                                         nfolds=5,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors, y=response, training_frame=cars)
>>> cars_gbm.auc(xval=True)
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> fold_numbers = cars.kfold_column(n_folds=5,
...                                  seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=cars,
...                fold_column="fold_numbers")
>>> cars_gbm.auc(xval=True)
property gainslift_bins

Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic binning.

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
...             y="IsDepDelayed",
...             training_frame=airlines)
>>> model.gains_lift()
property histogram_type

What type of histogram to use for finding optimal split points

One of: "auto", "uniform_adaptive", "random", "quantiles_global", "round_robin" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(histogram_type="UniformAdaptive",
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property huber_alpha

Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and 1).

Type: float (default: 0.9).

Examples

>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_gbm = H2OGradientBoostingEstimator(distribution="huber",
...                                              huber_alpha=0.9,
...                                              seed=1234)
>>> insurance_gbm.train(x=predictors,
...                     y=response,
...                     training_frame=train,
...                     validation_frame=valid)
>>> insurance_gbm.mse(valid=True)
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234,
...                                         ignore_const_cols=True)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_fold_assignment=True,
...                                         nfolds=5,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc()
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_models=True,
...                                         nfolds=5,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc()
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_predictions=True,
...                                         nfolds=5,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc()
property learn_rate

Learning rate (from 0.0 to 1.0)

Type: float (default: 0.1).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
...                                            learn_rate=0.01,
...                                            stopping_rounds=5,
...                                            stopping_metric="AUC",
...                                            stopping_tolerance=1e-4,
...                                            seed=1234)
>>> titanic_gbm.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
property learn_rate_annealing

Scale the learning rate by this factor after each tree (e.g., 0.99 or 0.999)

Type: float (default: 1).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
...                                            learn_rate=0.05,
...                                            learn_rate_annealing=.9,
...                                            stopping_rounds=5,
...                                            stopping_metric="AUC",
...                                            stopping_tolerance=1e-4,
...                                            seed=1234)
>>> titanic_gbm.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_gbm.auc(valid=True)
property max_abs_leafnode_pred

Maximum absolute value of a leaf node prediction

Type: float (default: 1.797693135e+308).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(max_abs_leafnode_pred=2,
...                                        seed=1234)
>>> cov_gbm.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
property max_after_balance_size

Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes.

Type: float (default: 5).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> max = .85
>>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
...                                        max_after_balance_size=max,
...                                        seed=1234)
>>> cov_gbm.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
property max_confusion_matrix_size

[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs

Type: int (default: 20).

property max_depth

Maximum tree depth (0 for unlimited).

Type: int (default: 5).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(ntrees=100,
...                                         max_depth=2,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property max_hit_ratio_k

[Deprecated] Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)

Type: int (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_gbm = H2OGradientBoostingEstimator(max_hit_ratio_k=3,
...                                        seed=1234)
>>> cov_gbm.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(max_runtime_secs=10,
...                                         ntrees=10000,
...                                         max_depth=10,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property min_rows

Fewest allowed (weighted) observations in a leaf.

Type: float (default: 10).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(min_rows=16,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property min_split_improvement

Minimum relative improvement in squared error reduction for a split to happen

Type: float (default: 1e-05).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(min_split_improvement=1e-3,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property monotone_constraints

A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint.

Type: dict.

Examples

>>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor()
>>> response = "CAPSULE"
>>> seed = 42
>>> monotone_constraints = {"AGE":1}
>>> gbm_model = H2OGradientBoostingEstimator(seed=seed,
...                                          monotone_constraints=monotone_constraints)
>>> gbm_model.train(y=response,
...                 ignored_columns=["ID"],
...                 training_frame=prostate_hex)
>>> gbm_model.scoring_history()
property nbins

For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point

Type: int (default: 20).

Examples

>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [16, 32, 64, 128, 256, 512]
>>> label = ["16", "32", "64", "128", "256", "512"]
>>> for key, num in enumerate(bin_num):
...     eeg_gbm = H2OGradientBoostingEstimator(nbins=num, seed=1234)
...     eeg_gbm.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
...     print(label[key], 'training score', eeg_gbm.auc(train=True)) 
...     print(label[key], 'validation score', eeg_gbm.auc(valid=True))
property nbins_cats

For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting.

Type: int (default: 1024).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
>>> label = ["8", "16", "32", "64", "128", "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
...     airlines_gbm = H2OGradientBoostingEstimator(nbins_cats=num, seed=1234)
...     airlines_gbm.train(x=predictors,
...                        y=response,
...                        training_frame=train,
...                        validation_frame=valid)
...     print(label[key], 'training score', airlines_gbm.auc(train=True))
...     print(label[key], 'validation score', airlines_gbm.auc(valid=True))
property nbins_top_level

For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level

Type: int (default: 1024).

Examples

>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [32, 64, 128, 256, 512, 1024, 2048, 4096]
>>> label = ["32", "64", "128", "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
...     eeg_gbm = H2OGradientBoostingEstimator(nbins_top_level=num, seed=1234)
...     eeg_gbm.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
...     print(label[key], 'training score', eeg_gbm.auc(train=True)) 
...     print(label[key], 'validation score', eeg_gbm.auc(valid=True))
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> cars_gbm = H2OGradientBoostingEstimator(nfolds=folds,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=cars)
>>> cars_gbm.auc()
property ntrees

Number of trees.

Type: int (default: 50).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> tree_num = [20, 50, 80, 110, 140, 170, 200]
>>> label = ["20", "50", "80", "110", "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
...     titanic_gbm = H2OGradientBoostingEstimator(ntrees=num,
...                                                seed=1234)
...     titanic_gbm.train(x=predictors,
...                       y=response,
...                       training_frame=train,
...                       validation_frame=valid)
...     print(label[key], 'training score', titanic_gbm.auc(train=True))
...     print(label[key], 'validation score', titanic_gbm.auc(valid=True))
property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston["offset"] = boston["medv"].log()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_gbm = H2OGradientBoostingEstimator(offset_column="offset",
...                                           seed=1234)
>>> boston_gbm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_gbm.mse(valid=True)
property pred_noise_bandwidth

Bandwidth (sigma) of Gaussian multiplicative noise ~N(1,sigma) for tree node predictions

Type: float (default: 0).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_gbm = H2OGradientBoostingEstimator(pred_noise_bandwidth=0.1,
...                                            seed=1234)
>>> titanic_gbm.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_gbm.auc(valid = True)
property quantile_alpha

Desired quantile for Quantile regression, must be between 0 and 1.

Type: float (default: 0.5).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_gbm = H2OGradientBoostingEstimator(distribution="quantile",
...                                           quantile_alpha=.8,
...                                           seed=1234)
>>> boston_gbm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_gbm.mse(valid=True)
property r2_stopping

r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and stopping_tolerance instead. Previous versions of H2O would stop making trees when the R^2 metric equaled or exceeded this value.

Type: float (default: 1.797693135e+308).

property response_column

Response variable column.

Type: str.

property sample_rate

Row sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(sample_rate=.7,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property sample_rate_per_class

A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree

Type: List[float].

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1]
>>> cov_gbm = H2OGradientBoostingEstimator(sample_rate_per_class=rate_per_class_list,
...                                        seed=1234)
>>> cov_gbm.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_gbm.logloss(valid=True)
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(score_each_iteration=True,
...                                         ntrees=55,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.scoring_history()
property score_tree_interval

Score the model after every so many trees. Disabled if set to 0.

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(score_tree_interval=5,
...                                         ntrees=55,
...                                         seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.scoring_history()
property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> gbm_w_seed_1 = H2OGradientBoostingEstimator(col_sample_rate=.7,
...                                             seed=1234)
>>> gbm_w_seed_1.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print('auc for the 1st model built with a seed:', gbm_w_seed_1.auc(valid=True))
property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
...                                             stopping_rounds=3,
...                                             stopping_tolerance=1e-2,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
...                                             stopping_rounds=3,
...                                             stopping_tolerance=1e-2,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0.001).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
...                                             stopping_rounds=3,
...                                             stopping_tolerance=1e-2,
...                                             seed=1234)
>>> airlines_gbm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_gbm.auc(valid=True)
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property tweedie_power

Tweedie power for Tweedie regression, must be between 1 and 2.

Type: float (default: 1.5).

Examples

>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
>>> insurance_gbm = H2OGradientBoostingEstimator(distribution="tweedie",
...                                              tweedie_power=1.2,
...                                              seed=1234)
>>> insurance_gbm.train(x=predictors,
...                     y=response,
...                     training_frame=train,
...                     validation_frame=valid)
>>> insurance_gbm.mse(valid=True)
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_gbm.auc(valid=True)
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> cars_gbm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid,
...                weights_column="weight")
>>> cars_gbm.auc(valid=True)

H2OGeneralizedLinearEstimator

class h2o.estimators.glm.H2OGeneralizedLinearEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Generalized Linear Modeling

Fits a generalized linear model, specified by a response variable, a set of predictors, and a description of the error distribution.

A subclass of ModelBase is returned. The specific subclass depends on the machine learning task at hand (if it’s binomial classification, then an H2OBinomialModel is returned, if it’s regression then a H2ORegressionModel is returned). The default print-out of the models is shown, but further GLM-specific information can be queried out of the object. Upon completion of the GLM, the resulting object has coefficients, normalized coefficients, residual/null deviance, aic, and a host of model metrics including MSE, AUC (for logistic regression), degrees of freedom, and confusion matrices.

property HGLM

If set to true, will return HGLM model. Otherwise, normal GLM model will be returned

Type: bool (default: False).

property Lambda

DEPRECATED. Use self.lambda_ instead

property alpha

Distribution of regularization between the L1 (Lasso) and L2 (Ridge) penalties. A value of 1 for alpha represents Lasso regression, a value of 0 produces Ridge regression, and anything in between specifies the amount of mixing between the two. Default value of alpha is 0 when SOLVER = ‘L-BFGS’; 0.5 otherwise.

Type: List[float].

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(alpha=.25)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print(boston_glm.mse(valid=True))
property balance_classes

Balance training data class counts via over/under-sampling (for imbalanced data).

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(balance_classes=True,
...                                          seed=1234)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property beta_constraints

Beta constraints

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> n = len(predictors)
>>> constraints = h2o.H2OFrame({'names':predictors,
...                             'lower_bounds': [-1000]*n,
...                             'upper_bounds': [1000]*n,
...                             'beta_given': [1]*n,
...                             'rho': [0.2]*n})
>>> cars_glm = H2OGeneralizedLinearEstimator(standardize=True,
...                                          beta_constraints=constraints)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property beta_epsilon

Converge if beta changes less (using L-infinity norm) than beta epsilon, ONLY applies to IRLSM solver

Type: float (default: 0.0001).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(beta_epsilon=1e-3)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property calc_like

If true, will return likelihood function value for HGLM.

Type: bool (default: False).

property class_sampling_factors

Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.

Type: List[float].

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cars_glm = H2OGeneralizedLinearEstimator(balance_classes=True,
...                                          class_sampling_factors=sample_factors,
...                                          seed=1234)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property cold_start

Only applicable to multiple alpha/lambda values. If false, build the next model for next set of alpha/lambda values starting from the values provided by current model. If true will start GLM model from scratch.

Type: bool (default: False).

property compute_p_values

Request p-values computation, p-values work only with IRLSM solver and no regularization

Type: bool (default: False).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8])
>>> airlines_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                              lambda_=0,
...                                              remove_collinear_columns=True,
...                                              compute_p_values=True)
>>> airlines_glm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_glm.mse()
property custom_metric_func

Reference to custom evaluation function, format: language:keyName=funcName

Type: str.

property early_stopping

Stop early when there is no more relative improvement on train or validation (if provided)

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                          early_stopping=True)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.auc(valid=True)
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> checkpoints = tempfile.mkdtemp()
>>> cars_glm = H2OGeneralizedLinearEstimator(export_checkpoints_dir=checkpoints,
...                                          seed=1234)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
>>> len(listdir(checkpoints))
property family

Family. Use binomial for classification with logistic regression, others are for regression problems.

One of: "auto", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(family='binomial')
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.auc(valid = True)
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> assignment_type = "Random"
>>> cars_glm = H2OGeneralizedLinearEstimator(fold_assignment=assignment_type,
...                                          nfolds=5,
...                                          family='binomial',
...                                          seed=1234)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=cars)
>>> cars_glm.auc(train=True)
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> print(cars['fold_numbers'])
>>> cars_glm = H2OGeneralizedLinearEstimator(seed=1234,
...                                          family="binomial")
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=cars,
...                fold_column="fold_numbers")
>>> cars_glm.auc(xval=True)
static getGLMRegularizationPath(model)[source]

Extract full regularization path explored during lambda search from glm model.

Parameters

model – source lambda search model

Examples

>>> d = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> m = H2OGeneralizedLinearEstimator(family = 'binomial',
...                                   lambda_search = True,
...                                   solver = 'COORDINATE_DESCENT')
>>> m.train(training_frame = d,
...         x = [2,3,4,5,6,7,8],
...         y = 1)
>>> r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(m)
>>> m2 = H2OGeneralizedLinearEstimator.makeGLMModel(model=m,
...                                                 coefs=r['coefficients'][10])
>>> dev1 = r['explained_deviance_train'][10]
>>> p = m2.model_performance(d)
>>> dev2 = 1-p.residual_deviance()/p.null_deviance()
>>> print(dev1, " =?= ", dev2)
property gradient_epsilon

Converge if objective changes less (using L-infinity norm) than this, ONLY applies to L-BFGS solver. Default indicates: If lambda_search is set to False and lambda is equal to zero, the default value of gradient_epsilon is equal to .000001, otherwise the default value is .0001. If lambda_search is set to True, the conditional values above are 1E-8 and 1E-6 respectively.

Type: float (default: -1).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(gradient_epsilon=1e-3)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse()
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(seed=1234,
...                                          ignore_const_cols=True,
...                                          family="binomial")
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.auc(valid=True)
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property interaction_pairs

A list of pairwise (first order) column interactions.

Type: List[tuple].

Examples

>>> df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> XY = [df.names[i-1] for i in [1,2,3,4,6,8,9,13,17,18,19,31]]
>>> interactions = [XY[i-1] for i in [5,7,9]]
>>> m = H2OGeneralizedLinearEstimator(lambda_search=True,
...                                   family="binomial",
...                                   interactions=interactions)
>>> m.train(x=XY[:len(XY)], y=XY[-1],training_frame=df)
>>> m._model_json['output']['coefficients_table']
>>> coef_m = m._model_json['output']['coefficients_table']
>>> interaction_pairs = [("CRSDepTime", "UniqueCarrier"),
...                      ("CRSDepTime", "Origin"),
...                      ("UniqueCarrier", "Origin")]
>>> mexp = H2OGeneralizedLinearEstimator(lambda_search=True,
...                                      family="binomial",
...                                      interaction_pairs=interaction_pairs)
>>> mexp.train(x=XY[:len(XY)], y=XY[-1],training_frame=df)
>>> mexp._model_json['output']['coefficients_table']
property interactions

A list of predictor column indices to interact. All pairwise combinations will be computed for the list.

Type: List[str].

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> interactions_list = ['crim', 'dis']
>>> boston_glm = H2OGeneralizedLinearEstimator(interactions=interactions_list) 
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse()
property intercept

Include constant term in the model

Type: bool (default: True).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris['class'] = iris['class'].asfactor()
>>> predictors = iris.columns[:-1]
>>> response = 'class'
>>> train, valid = iris.split_frame(ratios=[.8])
>>> iris_glm = H2OGeneralizedLinearEstimator(family='multinomial',
...                                          intercept=True)
>>> iris_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> iris_glm.logloss(valid=True)
property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(keep_cross_validation_fold_assignment=True,
...                                          nfolds=5,
...                                          seed=1234,
...                                          family="binomial")
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train)
>>> cars_glm.cross_validation_fold_assignment()
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(keep_cross_validation_models=True,
...                                          nfolds=5,
...                                          seed=1234,
...                                          family="binomial")
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train)
>>> cars_glm_cv_models = cars_glm.cross_validation_models()
>>> print(cars_glm.cross_validation_models())
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(keep_cross_validation_predictions=True,
...                                          nfolds=5,
...                                          seed=1234,
...                                          family="binomial")
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train)
>>> cars_glm.cross_validation_predictions()
property lambda_

Regularization strength

Type: List[float].

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8])
>>> airlines_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                              lambda_=.0001)
>>> airlines_glm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_glm.auc(valid=True))
property lambda_min_ratio

Minimum lambda used in lambda search, specified as a ratio of lambda_max (the smallest lambda that drives all coefficients to zero). Default indicates: if the number of observations is greater than the number of variables, then lambda_min_ratio is set to 0.0001; if the number of observations is less than the number of variables, then lambda_min_ratio is set to 0.01.

Type: float (default: -1).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(lambda_min_ratio=.0001)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse()

property lambda_search

Use lambda search starting at lambda max; the given lambda is then interpreted as lambda min.

Type: bool (default: False).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(lambda_search=True)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print(boston_glm.mse(valid=True))

property link

Link function.

One of: "family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit" (default: "family_default").

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris['class'] = iris['class'].asfactor()
>>> predictors = iris.columns[:-1]
>>> response = 'class'
>>> train, valid = iris.split_frame(ratios=[.8])
>>> iris_glm = H2OGeneralizedLinearEstimator(family='multinomial',
...                                          link='family_default')
>>> iris_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> iris_glm.logloss()
static makeGLMModel(model, coefs, threshold=0.5)[source]

Create a custom GLM model using the given coefficients.

Needs to be passed source model trained on the dataset to extract the dataset information from.

Parameters
  • model – source model, used for extracting dataset information

  • coefs – dictionary containing model coefficients

  • threshold – (optional, only for binomial) decision threshold used for classification

Examples

>>> d = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> m = H2OGeneralizedLinearEstimator(family='binomial',
...                                   lambda_search=True,
...                                   solver='COORDINATE_DESCENT')
>>> m.train(training_frame=d,
...         x=[2,3,4,5,6,7,8],
...         y=1)
>>> r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(m)
>>> m2 = H2OGeneralizedLinearEstimator.makeGLMModel(model=m,
...                                                 coefs=r['coefficients'][10])
>>> dev1 = r['explained_deviance_train'][10]
>>> p = m2.model_performance(d)
>>> dev2 = 1-p.residual_deviance()/p.null_deviance()
>>> print(dev1, " =?= ", dev2)
property max_active_predictors

Maximum number of active predictors during computation. Use as a stopping criterion to prevent expensive model building with many predictors. Default indicates: If the IRLSM solver is used, the value of max_active_predictors is set to 5000 otherwise it is set to 100000000.

Type: int (default: -1).

Examples

>>> higgs= h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/testng/higgs_train_5k.csv")
>>> predictors = higgs.names
>>> predictors.remove('response')
>>> response = "response"
>>> train, valid = higgs.split_frame(ratios=[.8])
>>> higgs_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                           max_active_predictors=200)
>>> higgs_glm.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> higgs_glm.auc()
property max_after_balance_size

Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes.

Type: float (default: 5).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> max = .85
>>> cars_glm = H2OGeneralizedLinearEstimator(balance_classes=True,
...                                          max_after_balance_size=max,
...                                          seed=1234)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property max_confusion_matrix_size

[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs

Type: int (default: 20).

property max_hit_ratio_k

[Deprecated] Maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","year"]
>>> response = "acceleration"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(max_hit_ratio_k=3,
...                                          seed=1234)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property max_iterations

Maximum number of iterations

Type: int (default: -1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                          max_iterations=50)
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm = H2OGeneralizedLinearEstimator(max_runtime_secs=10,
...                                          seed=1234) 
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.mse()
property missing_values_handling

Handling of missing values. Either MeanImputation, Skip or PlugValues.

One of: "mean_imputation", "skip", "plug_values" (default: "mean_imputation").

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston.insert_missing_values()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(missing_values_handling="skip")
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse()
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> cars_glm = H2OGeneralizedLinearEstimator(nfolds=folds,
...                                          seed=1234,
...                                          family='binomial')
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=cars)
>>> cars_glm.auc(xval=True)
property nlambdas

Number of lambdas to be used in a search. Default indicates: If alpha is zero, with lambda search set to True, the value of nlambdas is set to 30 (fewer lambdas are needed for ridge regression) otherwise it is set to 100.

Type: int (default: -1).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(lambda_search=True,
...                                            nlambdas=50)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print(boston_glm.mse(valid=True))
property non_negative

Restrict coefficients (not intercept) to be non-negative

Type: bool (default: False).

Examples

>>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8])
>>> airlines_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                              non_negative=True)
>>> airlines_glm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_glm.auc()
property obj_reg

Likelihood divider in objective value computation, default is 1/nobs

Type: float (default: -1).

Examples

>>> df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/glm_ordinal_logit/ordinal_multinomial_training_set.csv")
>>> df["C11"] = df["C11"].asfactor()
>>> ordinal_fit = H2OGeneralizedLinearEstimator(family="ordinal",
...                                             alpha=1.0,
...                                             lambda_=0.000000001,
...                                             obj_reg=0.00001,
...                                             max_iterations=1000,
...                                             beta_epsilon=1e-8,
...                                             objective_epsilon=1e-10)
>>> ordinal_fit.train(x=list(range(0,10)),
...                   y="C11",
...                   training_frame=df)
>>> ordinal_fit.mse()
property objective_epsilon

Converge if objective value changes less than this. Default indicates: If lambda_search is set to True the value of objective_epsilon is set to .0001. If the lambda_search is set to False and lambda is equal to zero, the value of objective_epsilon is set to .000001, for any other value of lambda the default value of objective_epsilon is set to .0001.

Type: float (default: -1).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(objective_epsilon=1e-3)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse()
property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> boston["offset"] = boston["medv"].log()
>>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
>>> boston_glm = H2OGeneralizedLinearEstimator(offset_column="offset",
...                                            seed=1234)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse(valid=True)
property plug_values

Plug Values (a single row frame containing values that will be used to impute missing values of the training/validation frame, use in conjunction with missing_values_handling = PlugValues)

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars = cars.drop(0)
>>> means = cars.mean()
>>> means = H2OFrame._expr(ExprNode("mean", cars, True, 0))
>>> glm_means = H2OGeneralizedLinearEstimator(seed=42)
>>> glm_means.train(training_frame=cars, y="cylinders")
>>> glm_plugs1 = H2OGeneralizedLinearEstimator(seed=42,
...                                            missing_values_handling="PlugValues",
...                                            plug_values=means)
>>> glm_plugs1.train(training_frame=cars, y="cylinders")
>>> glm_means.coef() == glm_plugs1.coef()
>>> not_means = 0.1 + (means * 0.5)
>>> glm_plugs2 = H2OGeneralizedLinearEstimator(seed=42,
...                                            missing_values_handling="PlugValues",
...                                            plug_values=not_means)
>>> glm_plugs2.train(training_frame=cars, y="cylinders")
>>> glm_means.coef() != glm_plugs2.coef()
property prior

Prior probability for y==1. To be used only for logistic regression iff the data has been sampled and the mean of response does not reflect reality.

Type: float (default: -1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_glm1 = H2OGeneralizedLinearEstimator(family='binomial', prior=0.5)
>>> cars_glm1.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                 validation_frame=valid)
>>> cars_glm1.mse()
property rand_family

Random Component Family array. One for each random component. Only gaussian is supported for now.

Type: List[Enum["[gaussian]"]].

property rand_link

Link function array for random component in HGLM.

Type: List[Enum["[identity]", "[family_default]"]].

property random_columns

Random column indices for HGLM.

Type: List[int].

property remove_collinear_columns

In case of linearly dependent columns, remove some of the dependent columns

Type: bool (default: False).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8])
>>> airlines_glm = H2OGeneralizedLinearEstimator(family='binomial',
...                                              lambda_=0,
...                                              remove_collinear_columns=True)
>>> airlines_glm.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_glm.auc()
property response_column

Response variable column.

Type: str.

property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(score_each_iteration=True,
...                                          seed=1234,
...                                          family='binomial')
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.scoring_history()
property score_iteration_interval

Perform scoring for every score_iteration_interval iterations

Type: int (default: -1).

property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
>>> glm_w_seed = H2OGeneralizedLinearEstimator(family='binomial',
...                                            seed=1234)
>>> glm_w_seed.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print(glm_w_seed.auc(valid=True))
property solver

AUTO will set the solver based on given data and the other parameters. IRLSM is fast on problems with small number of predictors and for lambda-search with L1 penalty, L_BFGS scales better for datasets with many columns.

One of: "auto", "irlsm", "l_bfgs", "coordinate_descent_naive", "coordinate_descent", "gradient_descent_lh", "gradient_descent_sqerr" (default: "auto").

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(solver='irlsm')
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print(boston_glm.mse(valid=True))
property standardize

Standardize numeric columns to have zero mean and unit variance

Type: bool (default: True).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_glm = H2OGeneralizedLinearEstimator(standardize=True)
>>> boston_glm.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_glm.mse()
property startval

Double array used to initialize the fixed and random coefficients for HGLM, or the coefficients for GLM.

Type: List[float].

property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing" (default: "auto").

property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 0).

property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0.001).

property theta

Theta

Type: float (default: 1e-10).

Examples

>>> h2o_df = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/Motor_insurance_sweden.txt")
>>> predictors = ["Payment", "Insured", "Kilometres", "Zone", "Bonus", "Make"]
>>> response = "Claims"
>>> negativebinomial_fit = H2OGeneralizedLinearEstimator(family="negativebinomial",
...                                                      link="identity",
...                                                      theta=0.5)
>>> negativebinomial_fit.train(x=predictors,
...                            y=response,
...                            training_frame=h2o_df)
>>> negativebinomial_fit.scoring_history()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(seed=1234,
...                                          family='binomial')
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.auc(train=True)

property tweedie_link_power

Tweedie link power

Type: float (default: 1).

Examples

>>> auto = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/auto.csv")
>>> predictors = auto.names
>>> predictors.remove('y')
>>> response = "y"
>>> train, valid = auto.split_frame(ratios=[.8])
>>> auto_glm = H2OGeneralizedLinearEstimator(family='tweedie',
...                                          tweedie_link_power=1)
>>> auto_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print(auto_glm.mse(valid=True))
property tweedie_variance_power

Tweedie variance power

Type: float (default: 0).

Examples

>>> auto = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/auto.csv")
>>> predictors = auto.names
>>> predictors.remove('y')
>>> response = "y"
>>> train, valid = auto.split_frame(ratios=[.8])
>>> auto_glm = H2OGeneralizedLinearEstimator(family='tweedie',
...                                          tweedie_variance_power=1)
>>> auto_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print(auto_glm.mse(valid=True))
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(seed=1234,
...                                          family='binomial')
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_glm.auc(valid=True)
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_glm = H2OGeneralizedLinearEstimator(seed=1234,
...                                          family='binomial')
>>> cars_glm.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid,
...                weights_column="weight")
>>> cars_glm.auc(valid=True)

H2ONaiveBayesEstimator

class h2o.estimators.naive_bayes.H2ONaiveBayesEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Naive Bayes

The naive Bayes classifier assumes independence between predictor variables conditional on the response, and a Gaussian distribution of numeric predictors with mean and standard deviation computed from the training dataset. When building a naive Bayes classifier, every row in the training dataset that contains at least one NA will be skipped completely. If the test dataset has missing values, then those predictors are omitted in the probability calculation during prediction.

property balance_classes

Balance training data class counts via over/under-sampling (for imbalanced data).

Type: bool (default: False).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris_nb = H2ONaiveBayesEstimator(balance_classes=False,
...                                  nfolds=3,
...                                  seed=1234)
>>> iris_nb.train(x=list(range(4)),
...               y=4,
...               training_frame=iris)
>>> iris_nb.mse()
property class_sampling_factors

Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.

Type: List[float].

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cov_nb = H2ONaiveBayesEstimator(class_sampling_factors=sample_factors,
...                                 seed=1234)
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> cov_nb.train(x=predictors, y=response, training_frame=covtype)
>>> cov_nb.logloss()
property compute_metrics

Compute metrics on training data

Type: bool (default: True).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
>>> prostate['RACE'] = prostate['RACE'].asfactor()
>>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
>>> prostate['DPROS'] = prostate['DPROS'].asfactor()
>>> response_col = 'CAPSULE'
>>> prostate_nb = H2ONaiveBayesEstimator(laplace=0,
...                                      compute_metrics=False)
>>> prostate_nb.train(x=list(range(3,9)),
...                   y=response_col,
...                   training_frame=prostate)
>>> prostate_nb.show()
property eps_prob

Cutoff below which probability is replaced with min_prob

Type: float (default: 0).

Examples

>>> import random
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> problem = random.choice(["binomial","multinomial"])
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> if problem == "binomial":
...     response_col = "economy_20mpg"
... else:
...     response_col = "cylinders"
>>> cars[response_col] = cars[response_col].asfactor()
>>> cars_nb = H2ONaiveBayesEstimator(min_prob=0.1,
...                                  eps_prob=0.5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors, y=response_col, training_frame=cars)
>>> cars_nb.mse()
property eps_sdev

Cutoff below which standard deviation is replaced with min_sdev

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> import random
>>> problem = random.choice(["binomial","multinomial"])
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> if problem == "binomial":
...     response_col = "economy_20mpg"
... else:
...     response_col = "cylinders"
>>> cars[response_col] = cars[response_col].asfactor()
>>> cars_nb = H2ONaiveBayesEstimator(min_sdev=0.1,
...                                  eps_sdev=0.5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors, y=response_col, training_frame=cars)
>>> cars_nb.mse()
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_nb = H2ONaiveBayesEstimator(export_checkpoints_dir=checkpoints_dir)
>>> air_nb.train(x=predictors, y=response, training_frame=airlines)
>>> len(listdir(checkpoints_dir))
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> cars_nb = H2ONaiveBayesEstimator(fold_assignment="Random",
...                                  nfolds=5,
...                                  seed=1234)
>>> response = "economy_20mpg"
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> cars_nb.train(x=predictors, y=response, training_frame=cars)
>>> cars_nb.auc()
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> cars_nb = H2ONaiveBayesEstimator(seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=cars,
...               fold_column="fold_numbers")
>>> cars_nb.auc()
property gainslift_bins

Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic binning.

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2ONaiveBayesEstimator(gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
...             y="IsDepDelayed",
...             training_frame=airlines)
>>> model.gains_lift()
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(seed=1234,
...                                  ignore_const_cols=True)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_nb.auc()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(keep_cross_validation_fold_assignment=True,
...                                  nfolds=5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train)
>>> cars_nb.cross_validation_fold_assignment()
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(keep_cross_validation_models=True,
...                                  nfolds=5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train)
>>> cars_nb.cross_validation_models()
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(keep_cross_validation_predictions=True,
...                                  nfolds=5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train)
>>> cars_nb.cross_validation_predictions()
property laplace

Laplace smoothing parameter

Type: float (default: 0).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
>>> prostate['RACE'] = prostate['RACE'].asfactor()
>>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
>>> prostate['DPROS'] = prostate['DPROS'].asfactor()
>>> prostate_nb = H2ONaiveBayesEstimator(laplace=1)
>>> prostate_nb.train(x=list(range(3,9)),
...                   y='CAPSULE',
...                   training_frame=prostate)
>>> prostate_nb.mse()
property max_after_balance_size

Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes.

Type: float (default: 5).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> max = .85
>>> cov_nb = H2ONaiveBayesEstimator(max_after_balance_size=max,
...                                 seed=1234) 
>>> cov_nb.train(x=predictors,
...              y=response,
...              training_frame=train,
...              validation_frame=valid)
>>> cov_nb.logloss()
property max_confusion_matrix_size

[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs

Type: int (default: 20).

property max_hit_ratio_k

[Deprecated] Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)

Type: int (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(max_hit_ratio_k=3,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_nb.mse()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(max_runtime_secs=10,
...                                  seed=1234) 
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_nb.auc()
property min_prob

Min. probability to use for observations with not enough data

Type: float (default: 0.001).

Examples

>>> import random
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> problem = random.choice(["binomial","multinomial"])
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> if problem == "binomial":
...     response_col = "economy_20mpg"
... else:
...     response_col = "cylinders"
>>> cars[response_col] = cars[response_col].asfactor()
>>> cars_nb = H2ONaiveBayesEstimator(min_prob=0.1,
...                                  eps_prob=0.5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response_col,
...               training_frame=cars)
>>> cars_nb.show()
property min_sdev

Min. standard deviation to use for observations with not enough data

Type: float (default: 0.001).

Examples

>>> import random
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> problem = random.choice(["binomial","multinomial"])
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> if problem == "binomial":
...     response_col = "economy_20mpg"
... else:
...     response_col = "cylinders"
>>> cars[response_col] = cars[response_col].asfactor()
>>> cars_nb = H2ONaiveBayesEstimator(min_sdev=0.1,
...                                  eps_sdev=0.5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response_col,
...               training_frame=cars)
>>> cars_nb.show()
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars_nb = H2ONaiveBayesEstimator(nfolds=5,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=cars)
>>> cars_nb.auc()
property response_column

Response variable column.

Type: str.

property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator(score_each_iteration=True,
...                                  seed=1234)
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_nb.auc()
property seed

Seed for pseudo random number generator (only used for cross-validation and fold_assignment=”Random” or “AUTO”)

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> nb_w_seed = H2ONaiveBayesEstimator(seed=1234)
>>> nb_w_seed.train(x=predictors,
...                 y=response,
...                 training_frame=train,
...                  validation_frame=valid)
>>> nb_wo_seed = H2ONaiveBayesEstimator()
>>> nb_wo_seed.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> nb_w_seed.auc()
>>> nb_wo_seed.auc()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator()
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_nb.auc()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_nb = H2ONaiveBayesEstimator()
>>> cars_nb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_nb.auc()

H2OSupportVectorMachineEstimator

class h2o.estimators.psvm.H2OSupportVectorMachineEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

PSVM

property disable_training_metrics

Disable calculating training metrics (expensive on large datasets)

Type: bool (default: True).

Examples

>>> from h2o.estimators import H2OSupportVectorMachineEstimator
>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property fact_threshold

Convergence threshold of the Incomplete Cholesky Factorization (ICF)

Type: float (default: 1e-05).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(disable_training_metrics=False,
...                                        fact_threshold=1e-7)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property feasible_threshold

Convergence threshold for primal-dual residuals in the IPM iteration

Type: float (default: 0.001).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(disable_training_metrics=False,
...                                        feasible_threshold=1e-7)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property gamma

Coefficient of the kernel (currently RBF gamma for gaussian kernel, -1 means 1/#features)

Type: float (default: -1).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property hyper_param

Penalty parameter C of the error term

Type: float (default: 1).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        hyper_param=0.01,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        ignore_const_cols=False,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property kernel_type

Type of used kernel

One of: "gaussian" (default: "gaussian").

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.1,
...                                        rank_ratio=0.1,
...                                        hyper_param=0.01,
...                                        kernel_type="gaussian",
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice) 
>>> svm.mse()
property max_iterations

Maximum number of iterations of the algorithm

Type: int (default: 200).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.1,
...                                        rank_ratio=0.1,
...                                        hyper_param=0.01,
...                                        max_iterations=20,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)  
>>> svm.mse()
property mu_factor

Increasing factor mu

Type: float (default: 10).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.1,
...                                        mu_factor=100.5,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice) 
>>> svm.mse()
property negative_weight

Weight of negative (-1) class of observations

Type: float (default: 1).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.1,
...                                        rank_ratio=0.1,
...                                        negative_weight=10,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)  
>>> svm.mse()
property positive_weight

Weight of positive (+1) class of observations

Type: float (default: 1).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.1,
...                                        rank_ratio=0.1,
...                                        positive_weight=0.1,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)   
>>> svm.mse()
property rank_ratio

Desired rank of the ICF matrix expressed as a ratio of the number of input rows (-1 means use sqrt(#rows)).

Type: float (default: -1).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.mse()
property response_column

Response variable column.

Type: str.

property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.1,
...                                        rank_ratio=0.1,
...                                        seed=1234,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice)
>>> svm.model_performance()
property surrogate_gap_threshold

Feasibility criterion of the surrogate duality gap (eta)

Type: float (default: 0.001).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        surrogate_gap_threshold=0.1,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice) 
>>> svm.mse()
property sv_threshold

Threshold for accepting a candidate observation into the set of support vectors

Type: float (default: 0.0001).

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> svm = H2OSupportVectorMachineEstimator(gamma=0.01,
...                                        rank_ratio=0.1,
...                                        sv_threshold=0.01,
...                                        disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=splice) 
>>> svm.mse()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> train, valid = splice.split_frame(ratios=[0.8])
>>> svm = H2OSupportVectorMachineEstimator(disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=train)
>>> svm.mse()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> splice = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/splice/splice.svm")
>>> train, valid = splice.split_frame(ratios=[0.8])
>>> svm = H2OSupportVectorMachineEstimator(disable_training_metrics=False)
>>> svm.train(y="C1", training_frame=train, validation_frame=valid)
>>> svm.mse()

H2ORandomForestEstimator

class h2o.estimators.random_forest.H2ORandomForestEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Distributed Random Forest

Builds a Distributed Random Forest (DRF) on a parsed dataset, for regression or classification.

property balance_classes

Balance training data class counts via over/under-sampling (for imbalanced data).

Type: bool (default: False).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_drf = H2ORandomForestEstimator(balance_classes=True,
...                                    seed=1234)
>>> cov_drf.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print('logloss', cov_drf.logloss(valid=True))
property binomial_double_trees

For binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(binomial_double_trees=False,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print('without binomial_double_trees:',
...        cars_drf.auc(valid=True))
>>> cars_drf_2 = H2ORandomForestEstimator(binomial_double_trees=True,
...                                       seed=1234)
>>> cars_drf_2.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print('with binomial_double_trees:', cars_drf_2.auc(valid=True))
property build_tree_one_node

Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(build_tree_one_node=True,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.auc(valid=True)
property calibrate_model

Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities.

Type: bool (default: False).

Examples

>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> response = 'Angaus'
>>> predictors = ecology.columns[3:13]
>>> train, calib = ecology.split_frame(seed=12354)
>>> w = h2o.create_frame(binary_fraction=1,
...                      binary_ones_fraction=0.5,
...                      missing_fraction=0,
...                      rows=744, cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_drf = H2ORandomForestEstimator(ntrees=10,
...                                        max_depth=5,
...                                        min_rows=10,
...                                        distribution="multinomial",
...                                        weights_column="weight",
...                                        calibrate_model=True,
...                                        calibration_frame=calib)
>>> ecology_drf.train(x=predictors,
...                   y="Angaus",
...                   training_frame=train)
>>> predicted = ecology_drf.predict(calib)
property calibration_frame

Calibration frame for Platt Scaling

Type: H2OFrame.

Examples

>>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
>>> ecology['Angaus'] = ecology['Angaus'].asfactor()
>>> response = 'Angaus'
>>> predictors = ecology.columns[3:13]
>>> train, calib = ecology.split_frame(seed = 12354)
>>> w = h2o.create_frame(binary_fraction=1,
...                      binary_ones_fraction=0.5,
...                      missing_fraction=0,
...                      rows=744, cols=1)
>>> w.set_names(["weight"])
>>> train = train.cbind(w)
>>> ecology_drf = H2ORandomForestEstimator(ntrees=10,
...                                        max_depth=5,
...                                        min_rows=10,
...                                        distribution="multinomial",
...                                        calibrate_model=True,
...                                        calibration_frame=calib)
>>> ecology_drf.train(x=predictors,
...                   y="Angaus",
...                   training_frame=train,
...                   weights_column="weight")
>>> predicted = ecology_drf.predict(train)
property categorical_encoding

Encoding scheme for categorical features

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") 
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> encoding = "one_hot_explicit"
>>> airlines_drf = H2ORandomForestEstimator(categorical_encoding=encoding,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_drf.auc(valid=True)
property check_constant_response

Check if response column is constant. If enabled, then an exception is thrown if the response column is a constant value. If disabled, then the model will train regardless of whether the response column is a constant value or not.

Type: bool (default: True).

Examples

>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
>>> train["constantCol"] = 1
>>> my_drf = H2ORandomForestEstimator(check_constant_response=False)
>>> my_drf.train(x=list(range(1,5)),
...              y="constantCol",
...              training_frame=train)
property checkpoint

Model checkpoint to resume training with.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_drf = H2ORandomForestEstimator(ntrees=1,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print(cars_drf.auc(valid=True))
property class_sampling_factors

Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.

Type: List[float].

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> print(covtype[54].table())
>>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
>>> cov_drf = H2ORandomForestEstimator(balance_classes=True,
...                                    class_sampling_factors=sample_factors,
...                                    seed=1234)
>>> cov_drf.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print('logloss', cov_drf.logloss(valid=True))
property col_sample_rate_change_per_level

Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(col_sample_rate_change_per_level=.9,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_drf.auc(valid=True))
property col_sample_rate_per_tree

Column sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(col_sample_rate_per_tree=.7,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_drf.auc(valid=True))
property custom_metric_func

Reference to custom evaluation function, format: language:keyName=funcName

Type: str.

property distribution

[Deprecated] Distribution function

One of: "auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(distribution="poisson",
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.mse(valid=True)
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> from h2o.grid.grid_search import H2OGridSearch
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> hyper_parameters = {'ntrees': [5,10]}
>>> search_crit = {'strategy': "RandomDiscrete",
...                'max_models': 5,
...                'seed': 1234,
...                'stopping_rounds': 3,
...                'stopping_metric': "AUTO",
...                'stopping_tolerance': 1e-2}
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_grid = H2OGridSearch(H2ORandomForestEstimator,
...                          hyper_params=hyper_parameters,
...                          search_criteria=search_crit)
>>> air_grid.train(x=predictors,
...                y=response,
...                training_frame=airlines,
...                distribution="bernoulli",
...                max_depth=3,
...                export_checkpoints_dir=checkpoints_dir)
>>> num_files = len(listdir(checkpoints_dir))
>>> num_files
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> assignment_type = "Random"
>>> cars_drf = H2ORandomForestEstimator(fold_assignment=assignment_type,
...                                     nfolds=5,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=cars)
>>> cars_drf.auc(xval=True)
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> print(cars['fold_numbers'])
>>> cars_drf = H2ORandomForestEstimator(seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=cars,
...                fold_column="fold_numbers")
>>> cars_drf.auc(xval=True)
property gainslift_bins

Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic binning.

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2ORandomForestEstimator(ntrees=1, gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
...             y="IsDepDelayed",
...             training_frame=airlines)
>>> model.gains_lift()
property histogram_type

What type of histogram to use for finding optimal split points

One of: "auto", "uniform_adaptive", "random", "quantiles_global", "round_robin" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(histogram_type="UniformAdaptive",
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_drf.auc(valid=True))
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(seed=1234,
...                                     ignore_const_cols=True)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.auc(valid=True)
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_fold_assignment=True,
...                                     nfolds=5,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train)
>>> cars_drf.cross_validation_fold_assignment()
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_models=True,
...                                     nfolds=5,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train)
>>> cars_drf.auc()
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_predictions=True,
...                                     nfolds=5,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train)
>>> cars_drf.cross_validation_predictions()
property max_after_balance_size

Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes.

Type: float (default: 5).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> print(covtype[54].table())
>>> max_size = .85
>>> cov_drf = H2ORandomForestEstimator(balance_classes=True,
...                                    max_after_balance_size=max_size,
...                                    seed=1234)
>>> cov_drf.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print('logloss', cov_drf.logloss(valid=True))
property max_confusion_matrix_size

[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs

Type: int (default: 20).

property max_depth

Maximum tree depth (0 for unlimited).

Type: int (default: 20).

Examples

>>> df = h2o.import_file(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> response = "survived"
>>> df[response] = df[response].asfactor()
>>> predictors = df.columns
>>> del predictors[1:3]
>>> train, valid, test = df.split_frame(ratios=[0.6,0.2],
...                                     seed=1234,
...                                     destination_frames=
...                                     ['train.hex','valid.hex','test.hex'])
>>> drf = H2ORandomForestEstimator()
>>> drf.train(x=predictors,
...           y=response,
...           training_frame=train)
>>> perf = drf.model_performance(valid)
>>> print(perf.auc())
property max_hit_ratio_k

[Deprecated] Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)

Type: int (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_drf = H2ORandomForestEstimator(max_hit_ratio_k=3,
...                                    seed=1234)
>>> cov_drf.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> cov_drf.show()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(max_runtime_secs=10,
...                                     ntrees=10000,
...                                     max_depth=10,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.auc(valid = True)
property min_rows

Fewest allowed (weighted) observations in a leaf.

Type: float (default: 1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(min_rows=16,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print(cars_drf.auc(valid=True))
property min_split_improvement

Minimum relative improvement in squared error reduction for a split to happen

Type: float (default: 1e-05).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(min_split_improvement=1e-3,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> print(cars_drf.auc(valid=True))
property mtries

Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt(p) for classification and p/3 for regression (where p is the number of predictors).

Type: int (default: -1).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
>>> cov_drf = H2ORandomForestEstimator(mtries=30, seed=1234)
>>> cov_drf.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print('logloss', cov_drf.logloss(valid=True))
property nbins

For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point

Type: int (default: 20).

Examples

>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [16, 32, 64, 128, 256, 512]
>>> label = ["16", "32", "64", "128", "256", "512"]
>>> for key, num in enumerate(bin_num):
...     # 'num' is the number of bins; 'key' is its index into 'label'
...     eeg_drf = H2ORandomForestEstimator(nbins=num, seed=1234)
...     eeg_drf.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
...     print(label[key], 'training score',
...           eeg_drf.auc(train=True))
...     print(label[key], 'validation score',
...           eeg_drf.auc(valid=True))
property nbins_cats

For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting.

Type: int (default: 1024).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> bin_num = [8, 16, 32, 64, 128, 256,
...            512, 1024, 2048, 4096]
>>> label = ["8", "16", "32", "64", "128",
...          "256", "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
...     # 'num' is the number of bins; 'key' is its index into 'label'
...     airlines_drf = H2ORandomForestEstimator(nbins_cats=num,
...                                             seed=1234)
...     airlines_drf.train(x=predictors,
...                        y=response,
...                        training_frame=train,
...                        validation_frame=valid)
...     print(label[key], 'training score',
...           airlines_drf.auc(train=True))
...     print(label[key], 'validation score',
...           airlines_drf.auc(valid=True))
property nbins_top_level

For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level

Type: int (default: 1024).

Examples

>>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
>>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
>>> predictors = eeg.columns[:-1]
>>> response = 'eyeDetection'
>>> train, valid = eeg.split_frame(ratios=[.8],
...                                seed=1234)
>>> bin_num = [32, 64, 128, 256, 512,
...            1024, 2048, 4096]
>>> label = ["32", "64", "128", "256",
...          "512", "1024", "2048", "4096"]
>>> for key, num in enumerate(bin_num):
...     # 'num' is the number of top-level bins; 'key' is its index into 'label'
...     eeg_drf = H2ORandomForestEstimator(nbins_top_level=num,
...                                        seed=1234)
...     eeg_drf.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
...     print(label[key], 'training score',
...           eeg_drf.auc(train=True))
...     print(label[key], 'validation score',
...           eeg_drf.auc(valid=True))
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> folds = 5
>>> cars_drf = H2ORandomForestEstimator(nfolds=folds,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=cars)
>>> cars_drf.auc(xval=True)
property ntrees

Number of trees.

Type: int (default: 50).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> tree_num = [20, 50, 80, 110,
...             140, 170, 200]
>>> label = ["20", "50", "80", "110",
...          "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
...     # 'num' is the number of trees; 'key' is its index into 'label'
...     titanic_drf = H2ORandomForestEstimator(ntrees=num,
...                                            seed=1234)
...     titanic_drf.train(x=predictors,
...                       y=response,
...                       training_frame=train,
...                       validation_frame=valid)
...     print(label[key], 'training score',
...           titanic_drf.auc(train=True))
...     print(label[key], 'validation score',
...           titanic_drf.auc(valid=True))
property offset_column

[Deprecated] Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

property r2_stopping

r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and stopping_tolerance instead. Previous versions of H2O would stop making trees when the R^2 metric equaled or exceeded this value.

Type: float (default: 1.797693135e+308).

property response_column

Response variable column.

Type: str.

property sample_rate

Row sample rate per tree (from 0.0 to 1.0)

Type: float (default: 0.632).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(sample_rate=.7,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_drf.auc(valid=True))
property sample_rate_per_class

A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree

Type: List[float].

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
...                                    seed=1234)
>>> print(train[response].table())
>>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1]
>>> cov_drf = H2ORandomForestEstimator(sample_rate_per_class=rate_per_class_list,
...                                    seed=1234)
>>> cov_drf.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print('logloss', cov_drf.logloss(valid=True))
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(score_each_iteration=True,
...                                     ntrees=55,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame = valid)
>>> cars_drf.scoring_history()
property score_tree_interval

Score the model after every so many trees. Disabled if set to 0.

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_drf = H2ORandomForestEstimator(score_tree_interval=5,
...                                     seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.scoring_history()
property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> drf_w_seed_1 = H2ORandomForestEstimator(seed=1234)
>>> drf_w_seed_1.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print('auc for the 1st model build with a seed:',
...        drf_w_seed_1.auc(valid=True))
property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc",
...                                         stopping_rounds=3,
...                                         stopping_tolerance=1e-2,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_drf.auc(valid=True)
property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc",
...                                         stopping_rounds=3,
...                                         stopping_tolerance=1e-2,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_drf.auc(valid=True)
property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0.001).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc",
...                                         stopping_rounds=3,
...                                         stopping_tolerance=1e-2,
...                                         seed=1234)
>>> airlines_drf.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_drf.auc(valid=True)
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_drf = H2ORandomForestEstimator(seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.auc(valid=True)
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_drf = H2ORandomForestEstimator(seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_drf.auc(valid=True)
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","acceleration","year"]
>>> response = "economy_20mpg"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_drf = H2ORandomForestEstimator(seed=1234)
>>> cars_drf.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid,
...                weights_column="weight")
>>> cars_drf.auc(valid=True)

H2OStackedEnsembleEstimator

class h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Stacked Ensemble

Builds a stacked ensemble (aka “super learner”) machine learning method that uses two or more H2O learning algorithms to improve predictive performance. It is a loss-based supervised learning method that finds the optimal combination of a collection of prediction algorithms. This method supports regression and binary classification.

Examples

>>> import h2o
>>> h2o.init()
>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> col_types = ["numeric", "numeric", "numeric", "enum",
...              "enum", "numeric", "numeric", "numeric", "numeric"]
>>> data = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv", col_types=col_types)
>>> train, test = data.split_frame(ratios=[.8], seed=1)
>>> x = ["CAPSULE","GLEASON","RACE","DPROS","DCAPS","PSA","VOL"]
>>> y = "AGE"
>>> nfolds = 5
>>> gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
...                                    fold_assignment="Modulo",
...                                    keep_cross_validation_predictions=True)
>>> gbm.train(x=x, y=y, training_frame=train)
>>> rf = H2ORandomForestEstimator(nfolds=nfolds,
...                               fold_assignment="Modulo",
...                               keep_cross_validation_predictions=True)
>>> rf.train(x=x, y=y, training_frame=train)
>>> stack = H2OStackedEnsembleEstimator(model_id="ensemble",
...                                     training_frame=train,
...                                     validation_frame=test,
...                                     base_models=[gbm.model_id, rf.model_id])
>>> stack.train(x=x, y=y, training_frame=train, validation_frame=test)
>>> stack.model_performance()
property base_models

List of models or grids (or their ids) to ensemble/stack together. Grids are expanded to individual models. If not using blending frame, then models must have been cross-validated using nfolds > 1, and folds must be identical across models.

Type: List[str] (default: []).

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> col_types = ["numeric", "numeric", "numeric", "enum",
...              "enum", "numeric", "numeric", "numeric", "numeric"]
>>> data = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv", col_types=col_types)
>>> train, test = data.split_frame(ratios=[.8], seed=1)
>>> x = ["CAPSULE","GLEASON","RACE","DPROS","DCAPS","PSA","VOL"]
>>> y = "AGE"
>>> nfolds = 5
>>> gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
...                                    fold_assignment="Modulo",
...                                    keep_cross_validation_predictions=True)
>>> gbm.train(x=x, y=y, training_frame=train)
>>> rf = H2ORandomForestEstimator(nfolds=nfolds,
...                               fold_assignment="Modulo",
...                               keep_cross_validation_predictions=True)
>>> rf.train(x=x, y=y, training_frame=train)
>>> stack = H2OStackedEnsembleEstimator(model_id="ensemble",
...                                     training_frame=train,
...                                     validation_frame=test,
...                                     base_models=[gbm.model_id, rf.model_id])
>>> stack.train(x=x, y=y, training_frame=train, validation_frame=test)
>>> stack.model_performance()
property blending_frame

Frame used to compute the predictions that serve as the training frame for the metalearner (triggers blending mode if provided)

Type: H2OFrame.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=10,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> import tempfile
>>> from os import listdir
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> checkpoints_dir = tempfile.mkdtemp()
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=10,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           export_checkpoints_dir=checkpoints_dir)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> len(listdir(checkpoints_dir))
property keep_levelone_frame

Keep level one frame used for metalearner training.

Type: bool (default: False).

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           keep_levelone_frame=True)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
levelone_frame_id()[source]

Fetch the levelone_frame_id for an H2OStackedEnsembleEstimator.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=10,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           keep_levelone_frame=True)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.levelone_frame_id()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

metalearner()[source]

Print the metalearner of an H2OStackedEnsembleEstimator.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=10,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           keep_levelone_frame=True)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.metalearner()
property metalearner_algorithm

Type of algorithm to use as the metalearner. Options include ‘AUTO’ (GLM with non-negative weights; if validation_frame is present, a lambda search is performed), ‘deeplearning’ (Deep Learning with default parameters), ‘drf’ (Random Forest with default parameters), ‘gbm’ (GBM with default parameters), ‘glm’ (GLM with default parameters), ‘naivebayes’ (NaiveBayes with default parameters), or ‘xgboost’ (if available, XGBoost with default parameters).

One of: "auto", "deeplearning", "drf", "gbm", "glm", "naivebayes", "xgboost" (default: "auto").

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           metalearner_algorithm="gbm")
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
property metalearner_fold_assignment

Cross-validation fold assignment scheme for metalearner cross-validation. Defaults to AUTO (which is currently set to Random). The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified".

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           metalearner_fold_assignment="Random")
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
property metalearner_fold_column

Column with cross-validation fold index assignment per observation for cross-validation of the metalearner.

Type: str.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_test_5k.csv")
>>> fold_column = "fold_id"
>>> train[fold_column] = train.kfold_column(n_folds=3, seed=1)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> x.remove(fold_column)
>>> train[y] = train[y].asfactor()
>>> test[y] = test[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=10,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                     metalearner_fold_column=fold_column,
...                                     metalearner_params=dict(keep_cross_validation_models=True))
>>> stack.train(x=x, y=y, training_frame=train)
>>> stack.model_performance().auc()
property metalearner_nfolds

Number of folds for K-fold cross-validation of the metalearner algorithm (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           metalearner_nfolds=3)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
property metalearner_params

Parameters for metalearner algorithm

Type: dict.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> gbm_params = {"ntrees" : 100, "max_depth" : 6}
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           metalearner_algorithm="gbm",
...                                           metalearner_params=gbm_params)
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

property response_column

Response variable column.

Type: str.

property score_training_samples

Specify the number of training set samples for scoring. The value must be >= 0. To use all training samples, enter 0.

Type: int (default: 10000).

property seed

Seed for random numbers; passed through to the metalearner algorithm. Defaults to -1 (time-based random number)

Type: int (default: -1).

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> blend[y] = blend[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           metalearner_fold_assignment="Random")
>>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
>>> stack_blend.model_performance(blend).auc()
train(x=None, y=None, training_frame=None, blending_frame=None, verbose=False, **kwargs)[source]

Train the H2O model.

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • training_frame (H2OFrame) – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold, offset, and weights).

  • offset_column – The name or index of the column in training_frame that holds the offsets.

  • fold_column – The name or index of the column in training_frame that holds the per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds the per-row weights.

  • validation_frame – H2OFrame with validation data to be scored on while training.

  • max_runtime_secs (float) – Maximum allowed runtime in seconds for model training. Use 0 to disable.

  • verbose (bool) – Print scoring history to stdout. Defaults to False.

property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, valid = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> valid[y] = valid[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           metalearner_fold_assignment="Random")
>>> stack_blend.train(x=x, y=y, training_frame=train, validation_frame=valid)
>>> stack_blend.model_performance(valid).auc()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
>>> train, valid = higgs.split_frame(ratios = [.8], seed = 1234)
>>> x = train.columns
>>> y = "response"
>>> x.remove(y)
>>> train[y] = train[y].asfactor()
>>> valid[y] = valid[y].asfactor()
>>> nfolds = 3
>>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
...                                       ntrees=1,
...                                       nfolds=nfolds,
...                                       fold_assignment="Modulo",
...                                       keep_cross_validation_predictions=True,
...                                       seed=1)
>>> my_gbm.train(x=x, y=y, training_frame=train)
>>> my_rf = H2ORandomForestEstimator(ntrees=50,
...                                  nfolds=nfolds,
...                                  fold_assignment="Modulo",
...                                  keep_cross_validation_predictions=True,
...                                  seed=1)
>>> my_rf.train(x=x, y=y, training_frame=train)
>>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
...                                           seed=1,
...                                           metalearner_fold_assignment="Random")
>>> stack_blend.train(x=x, y=y, training_frame=train, validation_frame=valid)
>>> stack_blend.model_performance(valid).auc()
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

H2OTargetEncoderEstimator

class h2o.estimators.targetencoder.H2OTargetEncoderEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

TargetEncoder

property blending

Blending enabled/disabled

Type: bool (default: False).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic["survived"] = titanic["survived"].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> titanic_te
property data_leakage_handling

Data leakage handling strategy.

One of: "none", "k_fold", "leave_one_out" (default: "none").

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic["survived"] = titanic["survived"].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        data_leakage_handling="k_fold",
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> titanic_te
property f

Smoothing. Used for blending (if enabled). Blending is to be enabled separately using the ‘blending’ parameter.

Type: float (default: 20).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic["survived"] = titanic["survived"].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> titanic_te
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic["survived"] = titanic["survived"].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> titanic_te
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property k

Inflection point. Used for blending (if enabled). Blending is to be enabled separately using the ‘blending’ parameter.

Type: float (default: 10).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic["survived"] = titanic["survived"].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> titanic_te
property noise_level

Noise level

Type: float (default: 0.01).

property response_column

Response variable column.

Type: str.

property seed

Seed for the specified noise level

Type: int (default: -1).

property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic["survived"] = titanic["survived"].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> titanic_te
transform(frame, data_leakage_handling='None', noise=-1, seed=-1)[source]

Apply transformation to te_columns based on the encoding maps generated during the train() method call.

Parameters
  • frame (H2OFrame) – the frame to which the target encoding transformations are applied.

  • data_leakage_handling (str) –

    Supported options:

    1. "k_fold" - encodings for a fold are generated based on out-of-fold data.

    2. "leave_one_out" - leave one out. The current row’s response value is subtracted from the pre-calculated per-level frequencies.

    3. "none" - nothing is held out; the whole frame is used for training.

  • noise (float) – the amount of random noise added to the target encoding. This helps prevent overfitting. Defaults to 0.01 * range of y.

  • seed (int) – a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1.

Example

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = ["home.dest", "cabin", "embarked"]
>>> response = "survived"
>>> titanic[response] = titanic[response].asfactor()
>>> fold_col = "kfold_column"
>>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
>>> titanic_te = H2OTargetEncoderEstimator(k=35,
...                                        f=25,
...                                        data_leakage_handling="leave_one_out",
...                                        blending=True)
>>> titanic_te.train(x=predictors,
...                  y=response,
...                  training_frame=titanic)
>>> transformed = titanic_te.transform(frame=titanic,
...                                    data_leakage_handling="leave_one_out",
...                                    seed=1234)

H2OXGBoostEstimator

class h2o.estimators.xgboost.H2OXGBoostEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

XGBoost

Builds an eXtreme Gradient Boosting model using the native XGBoost backend.

static available()[source]

Ask the H2O server whether an XGBoost model can be built (depends on availability of native backends). Returns True if an XGBoost model can be built, or False otherwise.

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(seed=1234)
>>> boston_xgb.available()
property backend

Backend. By default (auto), a GPU is used if available.

One of: "auto", "gpu", "cpu" (default: "auto").

Examples

>>> pros = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> pros["CAPSULE"] = pros["CAPSULE"].asfactor()
>>> pros_xgb = H2OXGBoostEstimator(tree_method="exact",
...                                seed=123,
...                                backend="cpu")
>>> pros_xgb.train(y="CAPSULE",
...                ignored_columns=["ID"],
...                training_frame=pros)
>>> pros_xgb.auc()
property booster

Booster type

One of: "gbtree", "gblinear", "dart" (default: "gbtree").

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
...                                   normalize_type="tree",
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property build_tree_one_node

Run on one node only; no network overhead but fewer CPUs used. Suitable for small datasets.

Type: bool (default: False).

property calibrate_model

Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities.

Type: bool (default: False).

property calibration_frame

Calibration frame for Platt Scaling

Type: H2OFrame.

property categorical_encoding

Encoding scheme for categorical features

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> encoding = "one_hot_explicit"
>>> airlines_xgb = H2OXGBoostEstimator(categorical_encoding=encoding,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
property checkpoint

Model checkpoint to resume training with.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
>>> predictors = ["displacement","power","weight","year","economy_20mpg"]
>>> response = "acceleration"
>>> from h2o.estimators import H2OXGBoostEstimator
>>> cars_xgb = H2OXGBoostEstimator(seed=1234)
>>> train, valid = cars.split_frame(ratios=[.8])
>>> cars_xgb.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_xgb.mse()
>>> cars_xgb_continued = H2OXGBoostEstimator(checkpoint=cars_xgb.model_id,
...                                          ntrees=51,
...                                          seed=1234)
>>> cars_xgb_continued.train(x=predictors,
...                          y=response,
...                          training_frame=train,
...                          validation_frame=valid)
>>> cars_xgb_continued.mse()
property col_sample_rate

(same as colsample_bylevel) Column sample rate (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property col_sample_rate_per_tree

(same as colsample_bytree) Column sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property colsample_bylevel

(same as col_sample_rate) Column sample rate (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property colsample_bynode

Column sample rate per tree node (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(colsample_bynode=.5,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors, y=response,
...                    training_frame=train, validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property colsample_bytree

(same as col_sample_rate_per_tree) Column sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property distribution

Distribution function

One of: "auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber" (default: "auto").

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> response = "cylinders"
>>> train, valid = cars.split_frame(ratios=[.8],
...                                 seed=1234)
>>> cars_xgb = H2OXGBoostEstimator(distribution="poisson",
...                                seed=1234)
>>> cars_xgb.train(x=predictors,
...                y=response,
...                training_frame=train,
...                validation_frame=valid)
>>> cars_xgb.mse(valid=True)
property dmatrix_type

Type of DMatrix. For sparse, NAs and 0 are treated equally.

One of: "auto", "dense", "sparse" (default: "auto").

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(dmatrix_type="auto",
...                                  seed=1234)
>>> boston_xgb.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_xgb.mse()
property eta

(same as learn_rate) Learning rate (from 0.0 to 1.0)

Type: float (default: 0.3).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000,
...                                   learn_rate=0.01,
...                                   stopping_rounds=5,
...                                   stopping_metric="AUC",
...                                   stopping_tolerance=1e-4,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from h2o.grid.grid_search import H2OGridSearch
>>> from os import listdir
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> response = "IsDepDelayed"
>>> hyper_parameters = {'ntrees': [5,10]}
>>> search_crit = {'strategy': "RandomDiscrete",
...                'max_models': 5,
...                'seed': 1234,
...                'stopping_rounds': 3,
...                'stopping_metric': "AUTO",
...                'stopping_tolerance': 1e-2}
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_grid = H2OGridSearch(H2OXGBoostEstimator,
...                          hyper_params=hyper_parameters,
...                          search_criteria=search_crit)
>>> air_grid.train(x=predictors,
...                y=response,
...                training_frame=airlines,
...                distribution="bernoulli",
...                learn_rate=0.1,
...                max_depth=3,
...                export_checkpoints_dir=checkpoints_dir)
>>> len(listdir(checkpoints_dir))
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> assignment_type = "Random"
>>> titanic_xgb = H2OXGBoostEstimator(fold_assignment=assignment_type,
...                                   nfolds=5,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=titanic)
>>> titanic_xgb.auc(xval=True)
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> fold_numbers = titanic.kfold_column(n_folds=5,
...                                     seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> titanic = titanic.cbind(fold_numbers)
>>> print(titanic['fold_numbers'])
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=titanic,
...                   fold_column="fold_numbers")
>>> titanic_xgb.auc(xval=True)
property gainslift_bins

Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic binning.

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> model = H2OXGBoostEstimator(ntrees=1, gainslift_bins=20)
>>> model.train(x=["Origin", "Distance"],
...             y="IsDepDelayed",
...             training_frame=airlines)
>>> model.gains_lift()
property gamma

(same as min_split_improvement) Minimum relative improvement in squared error reduction for a split to happen

Type: float (default: 0).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=1e-3,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property gpu_id

Which GPU to use.

Type: int (default: 0).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(gpu_id=0,
...                                  seed=1234)
>>> boston_xgb.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> boston_xgb.mse()
property grow_policy

Grow policy - depthwise is standard GBM, lossguide is LightGBM

One of: "depthwise", "lossguide" (default: "depthwise").

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> titanic["const_1"] = 6
>>> titanic["const_2"] = 7
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234,
...                                   grow_policy="depthwise")
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> titanic["const_1"] = 6
>>> titanic["const_2"] = 7
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234,
...                                   ignore_const_cols=True)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_fold_assignment=True,
...                                   nfolds=5,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train)
>>> titanic_xgb.cross_validation_fold_assignment()
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_models=True,
...                                   nfolds=5,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train)
>>> titanic_xgb.cross_validation_models()
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_predictions=True,
...                                   nfolds=5,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train)
>>> titanic_xgb.cross_validation_predictions()
property learn_rate

(same as eta) Learning rate (from 0.0 to 1.0)

Type: float (default: 0.3).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000,
...                                   learn_rate=0.01,
...                                   stopping_rounds=5,
...                                   stopping_metric="AUC",
...                                   stopping_tolerance=1e-4,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property max_abs_leafnode_pred

(same as max_delta_step) Maximum absolute value of a leaf node prediction

Type: float (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
...                                    seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_abs_leafnode_pred=float(2),
...                               seed=1234)
>>> cov_xgb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
property max_bins

For tree_method=hist only: maximum number of bins

Type: int (default: 256).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
...                                    seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_bins=200,
...                               seed=1234)
>>> cov_xgb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
property max_delta_step

(same as max_abs_leafnode_pred) Maximum absolute value of a leaf node prediction

Type: float (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
...                                    seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_delta_step=float(2),
...                               seed=1234)
>>> cov_xgb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
property max_depth

Maximum tree depth (0 for unlimited).

Type: int (default: 6).

Examples

>>> df = h2o.import_file(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> response = "survived"
>>> df[response] = df[response].asfactor()
>>> predictors = df.columns
>>> del predictors[1:3]
>>> train, valid, test = df.split_frame(ratios=[0.6,0.2],
...                                     seed=1234,
...                                     destination_frames=
...                                     ['train.hex',
...                                     'valid.hex',
...                                     'test.hex'])
>>> xgb = H2OXGBoostEstimator()
>>> xgb.train(x=predictors,
...           y=response,
...           training_frame=train)
>>> perf = xgb.model_performance(valid)
>>> print(perf.auc())
property max_leaves

For tree_method=hist only: maximum number of leaves

Type: int (default: 0).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(max_leaves=0, seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> covtype[54] = covtype[54].asfactor()
>>> predictors = covtype.columns[0:54]
>>> response = 'C55'
>>> train, valid = covtype.split_frame(ratios=[.8],
...                                    seed=1234)
>>> cov_xgb = H2OXGBoostEstimator(max_runtime_secs=10,
...                               ntrees=10000,
...                               max_depth=10,
...                               seed=1234)
>>> cov_xgb.train(x=predictors,
...               y=response,
...               training_frame=train,
...               validation_frame=valid)
>>> print(cov_xgb.logloss(valid=True))
property min_child_weight

(same as min_rows) Fewest allowed (weighted) observations in a leaf.

Type: float (default: 1).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_child_weight=16,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property min_rows

(same as min_child_weight) Fewest allowed (weighted) observations in a leaf.

Type: float (default: 1).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_rows=16,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property min_split_improvement

(same as gamma) Minimum relative improvement in squared error reduction for a split to happen

Type: float (default: 0).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=0.55,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property monotone_constraints

A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint.

Type: dict.

Examples

>>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor()
>>> response = "CAPSULE"
>>> seed=42
>>> monotone_constraints={"AGE":1}
>>> xgb_model = H2OXGBoostEstimator(seed=seed,
...                                 monotone_constraints=monotone_constraints)
>>> xgb_model.train(y=response,
...                 ignored_columns=["ID"],
...                 training_frame=prostate_hex)
>>> xgb_model.scoring_history()
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> folds = 5
>>> titanic_xgb = H2OXGBoostEstimator(nfolds=folds,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=titanic)
>>> titanic_xgb.auc(xval=True)
property normalize_type

For booster=dart only: normalize_type

One of: "tree", "forest" (default: "tree").

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
...                                   normalize_type="tree",
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property nthread

Number of parallel threads that can be used to run XGBoost. Cannot exceed H2O cluster limits (-nthreads parameter). Defaults to maximum available.

Type: int (default: -1).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> thread = 4
>>> titanic_xgb = H2OXGBoostEstimator(nthread=thread,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=titanic)
>>> print(titanic_xgb.auc(train=True))
property ntrees

(same as n_estimators) Number of trees.

Type: int (default: 50).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> tree_num = [20, 50, 80, 110, 140, 170, 200]
>>> label = ["20", "50", "80", "110",
...          "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
...     # Input integer for 'num' and 'key'
...     titanic_xgb = H2OXGBoostEstimator(ntrees=num,
...                                       seed=1234)
...     titanic_xgb.train(x=predictors,
...                       y=response,
...                       training_frame=train,
...                       validation_frame=valid)
...     print(label[key], 'training score',
...           titanic_xgb.auc(train=True))
...     print(label[key], 'validation score',
...           titanic_xgb.auc(valid=True))
property offset_column

Offset column. This will be added to the combination of columns before applying the link function.

Type: str.

property one_drop

For booster=dart only: one_drop

Type: bool (default: False).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(booster='dart',
...                                   one_drop=True,
...                                   seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property quiet_mode

Enable quiet mode

Type: bool (default: True).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234, quiet_mode=True)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_xgb.mse(valid=True)
property rate_drop

For booster=dart only: rate_drop (0..1)

Type: float (default: 0).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(rate_drop=0.1, seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> print(titanic_xgb.auc(valid=True))
property reg_alpha

L1 regularization

Type: float (default: 0).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> response = "medv"
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_xgb = H2OXGBoostEstimator(reg_alpha=.25)
>>> boston_xgb.train(x=predictors,
...                  y=response,
...                  training_frame=train,
...                  validation_frame=valid)
>>> print(boston_xgb.mse(valid=True))
property reg_lambda

L2 regularization

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8])
>>> airlines_xgb = H2OXGBoostEstimator(reg_lambda=.0001,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property response_column

Response variable column.

Type: str.

property sample_rate

(same as subsample) Row sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property sample_type

For booster=dart only: sample_type

One of: "uniform", "weighted" (default: "uniform").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"]= airlines["Year"].asfactor()
>>> airlines["Month"]= airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(sample_type="weighted",
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property save_matrix_directory

Directory where to save matrices passed to XGBoost library. Useful for debugging.

Type: str.

property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(score_each_iteration=True,
...                                    ntrees=55,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_xgb.scoring_history()
property score_tree_interval

Score the model after every so many trees. Disabled if set to 0.

Type: int (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(score_tree_interval=5,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_xgb.scoring_history()
property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> xgb_w_seed_1 = H2OXGBoostEstimator(col_sample_rate=.7,
...                                    seed=1234)
>>> xgb_w_seed_1.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> xgb_w_seed_2 = H2OXGBoostEstimator(col_sample_rate = .7,
...                                    seed = 1234)
>>> xgb_w_seed_2.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print('auc for the 1st model built with a seed:',
...        xgb_w_seed_1.auc(valid=True))
>>> print('auc for the 2nd model built with a seed:',
...        xgb_w_seed_2.auc(valid=True))
property skip_drop

For booster=dart only: skip_drop (0..1)

Type: float (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(skip_drop=0.5,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train)
>>> airlines_xgb.auc(train=True)
property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8], seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc",
...                                    stopping_rounds=3,
...                                    stopping_tolerance=1e-2,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc",
...                                    stopping_rounds=3,
...                                    stopping_tolerance=1e-2,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0.001).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc",
...                                    stopping_rounds=3,
...                                    stopping_tolerance=1e-2,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> airlines_xgb.auc(valid=True)
property subsample

(same as sample_rate) Row sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(subsample=.7,
...                                    seed=1234)
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_xgb.auc(valid=True)
property tree_method

Tree method

One of: "auto", "exact", "approx", "hist" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> airlines["Year"] = airlines["Year"].asfactor()
>>> airlines["Month"] = airlines["Month"].asfactor()
>>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
>>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> response = "IsDepDelayed"
>>> train, valid= airlines.split_frame(ratios=[.8],
...                                    seed=1234)
>>> airlines_xgb = H2OXGBoostEstimator(seed=1234,
...                                    tree_method="approx")
>>> airlines_xgb.train(x=predictors,
...                    y=response,
...                    training_frame=train,
...                    validation_frame=valid)
>>> print(airlines_xgb.auc(valid=True))
property tweedie_power

Tweedie power for Tweedie regression, must be between 1 and 2.

Type: float (default: 1.5).

Examples

>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> train, valid = insurance.split_frame(ratios=[.8],
...                                      seed=1234)
>>> insurance_xgb = H2OXGBoostEstimator(distribution="tweedie",
...                                     tweedie_power=1.2,
...                                     seed=1234)
>>> insurance_xgb.train(x=predictors,
...                     y=response,
...                     training_frame=train,
...                     validation_frame=valid)
>>> print(insurance_xgb.mse(valid=True))
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance['Group'] = insurance['Group'].asfactor()
>>> insurance['Age'] = insurance['Age'].asfactor()
>>> predictors = insurance.columns[0:4]
>>> response = 'Claims'
>>> train, valid = insurance.split_frame(ratios=[.8],
...                                      seed=1234)
>>> insurance_xgb = H2OXGBoostEstimator(seed=1234)
>>> insurance_xgb.train(x=predictors,
...                     y=response,
...                     training_frame=train,
...                     validation_frame=valid)
>>> print(insurance_xgb.mse(valid=True))
property weights_column

Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor.

Type: str.

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic['survived'] = titanic['survived'].asfactor()
>>> predictors = titanic.columns
>>> del predictors[1:3]
>>> response = 'survived'
>>> train, valid = titanic.split_frame(ratios=[.8],
...                                    seed=1234)
>>> titanic_xgb = H2OXGBoostEstimator(seed=1234)
>>> titanic_xgb.train(x=predictors,
...                   y=response,
...                   training_frame=train,
...                   validation_frame=valid)
>>> titanic_xgb.auc(valid=True)

Unsupervised

H2OAggregatorEstimator

class h2o.estimators.aggregator.H2OAggregatorEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Aggregator

property categorical_encoding

Encoding scheme for categorical features

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> df = h2o.create_frame(rows=10000,
...                       cols=10,
...                       categorical_fraction=0.6,
...                       integer_fraction=0,
...                       binary_fraction=0,
...                       real_range=100,
...                       integer_range=100,
...                       missing_fraction=0,
...                       factors=100,
...                       seed=1234)
>>> params = {"target_num_exemplars": 1000,
...           "rel_tol_num_exemplars": 0.5,
...           "categorical_encoding": "eigen"}
>>> agg = H2OAggregatorEstimator(**params)
>>> agg.train(training_frame=df)
>>> new_df = agg.aggregated_frame
>>> new_df
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> checkpoints_dir = tempfile.mkdtemp()
>>> model = H2OAggregatorEstimator(target_num_exemplars=500, 
...                                rel_tol_num_exemplars=0.3,
...                                export_checkpoints_dir=checkpoints_dir)
>>> model.train(training_frame=df)
>>> new_df = model.aggregated_frame
>>> new_df
>>> len(listdir(checkpoints_dir))
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> params = {"ignore_const_cols": False,
...           "target_num_exemplars": 500,
...           "rel_tol_num_exemplars": 0.3,
...           "transform": "standardize",
...           "categorical_encoding": "eigen"}
>>> model = H2OAggregatorEstimator(**params)
>>> model.train(training_frame=df)
>>> new_df = model.aggregated_frame
>>> new_df
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property num_iteration_without_new_exemplar

The number of iterations to run before aggregator exits if the number of exemplars collected didn’t change

Type: int (default: 500).

Examples

>>> df = h2o.create_frame(rows=10000,
...                       cols=10,
...                       categorical_fraction=0.6,
...                       integer_fraction=0,
...                       binary_fraction=0,
...                       real_range=100,
...                       integer_range=100,
...                       missing_fraction=0,
...                       factors=100,
...                       seed=1234)
>>> params = {"target_num_exemplars": 1000,
...           "rel_tol_num_exemplars": 0.5,
...           "categorical_encoding": "eigen",
...           "num_iteration_without_new_exemplar": 400}
>>> agg = H2OAggregatorEstimator(**params)
>>> agg.train(training_frame=df)
>>> new_df = agg.aggregated_frame
>>> new_df
property rel_tol_num_exemplars

Relative tolerance for number of exemplars (e.g., 0.5 is +/- 50 percent)

Type: float (default: 0.5).

Examples

>>> df = h2o.create_frame(rows=10000,
...                       cols=10,
...                       categorical_fraction=0.6,
...                       integer_fraction=0,
...                       binary_fraction=0,
...                       real_range=100,
...                       integer_range=100,
...                       missing_fraction=0,
...                       factors=100,
...                       seed=1234)
>>> params = {"target_num_exemplars": 1000,
...           "rel_tol_num_exemplars": 0.5,
...           "categorical_encoding": "eigen",
...           "num_iteration_without_new_exemplar": 400}
>>> agg = H2OAggregatorEstimator(**params)
>>> agg.train(training_frame=df)
>>> new_df = agg.aggregated_frame
>>> new_df
property response_column

Response variable column.

Type: str.

property save_mapping_frame

Whether to export the mapping of the aggregated frame

Type: bool (default: False).

Examples

>>> df = h2o.create_frame(rows=10000,
...                       cols=10,
...                       categorical_fraction=0.6,
...                       integer_fraction=0,
...                       binary_fraction=0,
...                       real_range=100,
...                       integer_range=100,
...                       missing_fraction=0,
...                       factors=100,
...                       seed=1234)
>>> params = {"target_num_exemplars": 1000,
...           "rel_tol_num_exemplars": 0.5,
...           "categorical_encoding": "eigen",
...           "save_mapping_frame": True}
>>> agg = H2OAggregatorEstimator(**params)
>>> agg.train(training_frame=df)
>>> mapping_frame = agg.mapping_frame
>>> mapping_frame
property target_num_exemplars

Targeted number of exemplars

Type: int (default: 5000).

Examples

>>> df = h2o.create_frame(rows=10000,
...                       cols=10,
...                       categorical_fraction=0.6,
...                       integer_fraction=0,
...                       binary_fraction=0,
...                       real_range=100,
...                       integer_range=100,
...                       missing_fraction=0,
...                       factors=100,
...                       seed=1234)
>>> params = {"target_num_exemplars": 1000,
...           "rel_tol_num_exemplars": 0.5,
...           "categorical_encoding": "eigen",
...           "num_iteration_without_new_exemplar": 400}
>>> agg = H2OAggregatorEstimator(**params)
>>> agg.train(training_frame=df)
>>> new_df = agg.aggregated_frame
>>> new_df
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> df = h2o.create_frame(rows=10000,
...                       cols=10,
...                       categorical_fraction=0.6,
...                       integer_fraction=0,
...                       binary_fraction=0,
...                       real_range=100,
...                       integer_range=100,
...                       missing_fraction=0,
...                       factors=100,
...                       seed=1234)
>>> params = {"target_num_exemplars": 1000,
...           "rel_tol_num_exemplars": 0.5,
...           "categorical_encoding": "eigen",
...           "num_iteration_without_new_exemplar": 400}
>>> agg = H2OAggregatorEstimator(**params)
>>> agg.train(training_frame=df)
>>> new_df = agg.aggregated_frame
>>> new_df
property transform

Transformation of training data

One of: "none", "standardize", "normalize", "demean", "descale" (default: "normalize").

Examples

>>> df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> params = {"ignore_const_cols": False,
...           "target_num_exemplars": 500,
...           "rel_tol_num_exemplars": 0.3,
...           "transform": "standardize",
...           "categorical_encoding": "eigen"}
>>> model = H2OAggregatorEstimator(**params)
>>> model.train(training_frame=df)
>>> new_df = model.aggregated_frame

H2OAutoEncoderEstimator

class h2o.estimators.deeplearning.H2OAutoEncoderEstimator(**kwargs)[source]

Bases: h2o.estimators.deeplearning.H2ODeepLearningEstimator

Examples

>>> import h2o as ml
>>> from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
>>> ml.init()
>>> rows = [[1,2,3,4,0]*50, [2,1,2,4,1]*50, [2,1,4,2,1]*50, [0,1,2,34,1]*50, [2,3,4,1,0]*50]
>>> fr = ml.H2OFrame(rows)
>>> fr[4] = fr[4].asfactor()
>>> model = H2OAutoEncoderEstimator()
>>> model.train(x=range(4), training_frame=fr)

H2OGenericEstimator

class h2o.estimators.generic.H2OGenericEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Import MOJO Model

static from_file(file=<class 'str'>)[source]

Creates a new Generic model by loading an existing embedded model into the library, e.g. from an H2O MOJO. The imported model must be supported by H2O.

Parameters

file – A string containing path to the file to create the model from

Returns

H2OGenericEstimator instance representing the generic model

Examples

>>> from h2o.estimators import H2OIsolationForestEstimator, H2OGenericEstimator
>>> import tempfile
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> ifr = H2OIsolationForestEstimator(ntrees=1)
>>> ifr.train(x=["Origin","Dest"], y="Distance", training_frame=airlines)
>>> original_model_filename = tempfile.mkdtemp()
>>> original_model_filename = ifr.download_mojo(original_model_filename)
>>> model = H2OGenericEstimator.from_file(original_model_filename)
>>> model.model_performance()
property model_key

Key to the self-contained model archive already uploaded to H2O.

Type: H2OFrame.

Examples

>>> from h2o.estimators import H2OGenericEstimator, H2OXGBoostEstimator
>>> import tempfile
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> y = "IsDepDelayed"
>>> x = ["fYear","fMonth","Origin","Dest","Distance"]
>>> xgb = H2OXGBoostEstimator(ntrees=1, nfolds=3)
>>> xgb.train(x=x, y=y, training_frame=airlines)
>>> original_model_filename = tempfile.mkdtemp()
>>> original_model_filename = xgb.download_mojo(original_model_filename)
>>> key = h2o.lazy_import(original_model_filename)
>>> fr = h2o.get_frame(key[0])
>>> model = H2OGenericEstimator(model_key=fr)
>>> model.train()
>>> model.auc()
property path

Path to file with self-contained model archive.

Type: str.

Examples

>>> from h2o.estimators import H2OIsolationForestEstimator, H2OGenericEstimator
>>> import tempfile
>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
>>> ifr = H2OIsolationForestEstimator(ntrees=1)
>>> ifr.train(x=["Origin","Dest"], y="Distance", training_frame=airlines)
>>> generic_mojo_filename = tempfile.mkdtemp("zip","genericMojo")
>>> generic_mojo_filename = ifr.download_mojo(path=generic_mojo_filename)
>>> model = H2OGenericEstimator.from_file(generic_mojo_filename)
>>> model.model_performance()

H2OGeneralizedLowRankEstimator

class h2o.estimators.glrm.H2OGeneralizedLowRankEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Generalized Low Rank Modeling

Builds a generalized low rank model of an H2O dataset.

property expand_user_y

Expand categorical columns in user-specified initial Y

Type: bool (default: True).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> rank = 3
>>> gx = 0.5
>>> gy = 0.5
>>> trans = "standardize"
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=rank,
...                                            loss="Quadratic",
...                                            gamma_x=gx,
...                                            gamma_y=gy,
...                                            transform=trans,
...                                            expand_user_y=False)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> checkpoints_dir = tempfile.mkdtemp()
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                            export_checkpoints_dir=checkpoints_dir,
...                                            seed=1234)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> len(listdir(checkpoints_dir))
property gamma_x

Regularization weight on X matrix

Type: float (default: 0).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> rank = 3
>>> gx = 0.5
>>> gy = 0.5
>>> trans = "standardize"
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=rank,
...                                            loss="Quadratic",
...                                            gamma_x=gx,
...                                            gamma_y=gy,
...                                            transform=trans)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property gamma_y

Regularization weight on Y matrix

Type: float (default: 0).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> rank = 3
>>> gx = 0.5
>>> gy = 0.5
>>> trans = "standardize"
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=rank,
...                                            loss="Quadratic",
...                                            gamma_x=gx,
...                                            gamma_y=gy,
...                                            transform=trans)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                            ignore_const_cols=False,
...                                            seed=1234)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property impute_original

Reconstruct original training data by reversing transform

Type: bool (default: False).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> rank = 3
>>> gx = 0.5
>>> gy = 0.5
>>> trans = "standardize"
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=rank,
...                                            loss="Quadratic",
...                                            gamma_x=gx,
...                                            gamma_y=gy,
...                                            transform=trans,
...                                            impute_original=True)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property init

Initialization mode

One of: "random", "svd", "plus_plus", "user" (default: "plus_plus").

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                            init="svd",
...                                            seed=1234) 
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property init_step_size

Initial step size

Type: float (default: 1).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                            init_step_size=2.5,
...                                            seed=1234) 
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property k

Rank of matrix approximation

Type: int (default: 1).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=3)
>>> iris_glrm.train(x=iris.names, training_frame=iris)
>>> iris_glrm.show()
property loading_name

[Deprecated] Use representation_name instead. Frame key to save resulting X.

Type: str.

Examples

>>> # loading_name is deprecated. Use representation_name instead.
>>> acs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip")
>>> acs_fill = acs.drop("ZCTA5")
>>> acs_glrm = H2OGeneralizedLowRankEstimator(k=10,
...                                           transform="standardize",
...                                           loss="quadratic",
...                                           regularization_x="quadratic",
...                                           regularization_y="L1",
...                                           gamma_x=0.25,
...                                           gamma_y=0.5,
...                                           max_iterations=1,
...                                           loading_name="acs_full")
>>> acs_glrm.train(x=acs_fill.names, training_frame=acs)
>>> acs_glrm.loading_name
>>> acs_glrm.show()
property loss

Numeric loss function

One of: "quadratic", "absolute", "huber", "poisson", "hinge", "logistic", "periodic" (default: "quadratic").

Examples

>>> acs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip")
>>> acs_fill = acs.drop("ZCTA5")
>>> acs_glrm = H2OGeneralizedLowRankEstimator(k=10,
...                                           transform="standardize",
...                                           loss="absolute",
...                                           regularization_x="quadratic",
...                                           regularization_y="L1",
...                                           gamma_x=0.25,
...                                           gamma_y=0.5,
...                                           max_iterations=700)
>>> acs_glrm.train(x=acs_fill.names, training_frame=acs)
>>> acs_glrm.show()
property loss_by_col

Loss function by column (override)

Type: List[Enum["quadratic", "absolute", "huber", "poisson", "hinge", "logistic", "periodic", "categorical", "ordinal"]].

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               loss="quadratic",
...                                               loss_by_col=["absolute","huber"],
...                                               loss_by_col_idx=[0,3],
...                                               regularization_x="quadratic",
...                                               regularization_y="l1")
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property loss_by_col_idx

Loss function by column index (override)

Type: List[int].

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               loss="quadratic",
...                                               loss_by_col=["absolute","huber"],
...                                               loss_by_col_idx=[0,3],
...                                               regularization_x="quadratic",
...                                               regularization_y="l1")
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property max_iterations

Maximum number of iterations

Type: int (default: 1000).

Examples

>>> acs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip")
>>> acs_fill = acs.drop("ZCTA5")
>>> acs_glrm = H2OGeneralizedLowRankEstimator(k=10,
...                                           transform="standardize",
...                                           loss="quadratic",
...                                           regularization_x="quadratic",
...                                           regularization_y="L1",
...                                           gamma_x=0.25,
...                                           gamma_y=0.5,
...                                           max_iterations=700)
>>> acs_glrm.train(x=acs_fill.names, training_frame=acs)
>>> acs_glrm.show()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               max_runtime_secs=15,
...                                               max_iterations=500,
...                                               max_updates=900,
...                                               min_step_size=0.005)
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property max_updates

Maximum number of updates, defaults to 2*max_iterations

Type: int (default: 2000).

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               max_runtime_secs=15,
...                                               max_iterations=500,
...                                               max_updates=900,
...                                               min_step_size=0.005)
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property min_step_size

Minimum step size

Type: float (default: 0.0001).

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               max_runtime_secs=15,
...                                               max_iterations=500,
...                                               max_updates=900,
...                                               min_step_size=0.005)
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property multi_loss

Categorical loss function

One of: "categorical", "ordinal" (default: "categorical").

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               loss="quadratic",
...                                               loss_by_col=["absolute","huber"],
...                                               loss_by_col_idx=[0,3],
...                                               regularization_x="quadratic",
...                                               regularization_y="l1",
...                                               multi_loss="ordinal")
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property period

Length of period (only used with periodic loss function)

Type: int (default: 1).

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               max_runtime_secs=15,
...                                               max_iterations=500,
...                                               max_updates=900,
...                                               min_step_size=0.005,
...                                               period=5)
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property recover_svd

Recover singular values and eigenvectors of XY

Type: bool (default: False).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_cat.csv")
>>> prostate[0] = prostate[0].asnumeric()
>>> prostate[4] = prostate[4].asnumeric()
>>> loss_all = ["Hinge", "Quadratic", "Categorical", "Categorical",
...             "Hinge", "Quadratic", "Quadratic", "Quadratic"]
>>> pros_glrm = H2OGeneralizedLowRankEstimator(k=5,
...                                            loss_by_col=loss_all,
...                                            recover_svd=True,
...                                            transform="standardize",
...                                            seed=12345)
>>> pros_glrm.train(x=prostate.names, training_frame=prostate)
>>> pros_glrm.show()
property regularization_x

Regularization function for X matrix

One of: "none", "quadratic", "l2", "l1", "non_negative", "one_sparse", "unit_one_sparse", "simplex" (default: "none").

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               loss="quadratic",
...                                               loss_by_col=["absolute","huber"],
...                                               loss_by_col_idx=[0,3],
...                                               regularization_x="quadratic",
...                                               regularization_y="l1")
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property regularization_y

Regularization function for Y matrix

One of: "none", "quadratic", "l2", "l1", "non_negative", "one_sparse", "unit_one_sparse", "simplex" (default: "none").

Examples

>>> arrestsH2O = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/pca_test/USArrests.csv")
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                               loss="quadratic",
...                                               loss_by_col=["absolute","huber"],
...                                               loss_by_col_idx=[0,3],
...                                               regularization_x="quadratic",
...                                               regularization_y="l1")
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property representation_name

Frame key to save resulting X

Type: str.

Examples

>>> acs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip")
>>> acs_fill = acs.drop("ZCTA5")
>>> acs_glrm = H2OGeneralizedLowRankEstimator(k=10,
...                                           transform="standardize",
...                                           loss="quadratic",
...                                           regularization_x="quadratic",
...                                           regularization_y="L1",
...                                           gamma_x=0.25,
...                                           gamma_y=0.5,
...                                           max_iterations=1,
...                                           representation_name="acs_full")
>>> acs_glrm.train(x=acs_fill.names, training_frame=acs)
>>> acs_glrm.representation_name
>>> acs_glrm.show()
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_cat.csv")
>>> prostate[0] = prostate[0].asnumeric()
>>> prostate[4] = prostate[4].asnumeric()
>>> loss_all = ["Hinge", "Quadratic", "Categorical", "Categorical",
...             "Hinge", "Quadratic", "Quadratic", "Quadratic"]
>>> pros_glrm = H2OGeneralizedLowRankEstimator(k=5,
...                                            loss_by_col=loss_all,
...                                            score_each_iteration=True,
...                                            transform="standardize",
...                                            seed=12345)
>>> pros_glrm.train(x=prostate.names, training_frame=prostate)
>>> pros_glrm.show()
property seed

RNG seed for initialization

Type: int (default: -1).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_cat.csv")
>>> prostate[0] = prostate[0].asnumeric()
>>> prostate[4] = prostate[4].asnumeric()
>>> glrm_w_seed = H2OGeneralizedLowRankEstimator(k=5, seed=12345) 
>>> glrm_w_seed.train(x=prostate.names, training_frame=prostate)
>>> glrm_wo_seed = H2OGeneralizedLowRankEstimator(k=5)
>>> glrm_wo_seed.train(x=prostate.names, training_frame=prostate)
>>> glrm_w_seed.show()
>>> glrm_wo_seed.show()
property svd_method

Method for computing SVD during initialization (Caution: Randomized is currently experimental and unstable)

One of: "gram_s_v_d", "power", "randomized" (default: "randomized").

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_cat.csv")
>>> prostate[0] = prostate[0].asnumeric()
>>> prostate[4] = prostate[4].asnumeric()
>>> pros_glrm = H2OGeneralizedLowRankEstimator(k=5,
...                                            svd_method="power",
...                                            seed=1234)
>>> pros_glrm.train(x=prostate.names, training_frame=prostate)
>>> pros_glrm.show()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_cat.csv")
>>> prostate[0] = prostate[0].asnumeric()
>>> prostate[4] = prostate[4].asnumeric()
>>> pros_glrm = H2OGeneralizedLowRankEstimator(k=5,
...                                            seed=1234)
>>> pros_glrm.train(x=prostate.names, training_frame=prostate)
>>> pros_glrm.show()
property transform

Transformation of training data

One of: "none", "standardize", "normalize", "demean", "descale" (default: "none").

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_cat.csv")
>>> prostate[0] = prostate[0].asnumeric()
>>> prostate[4] = prostate[4].asnumeric()
>>> pros_glrm = H2OGeneralizedLowRankEstimator(k=5,
...                                            score_each_iteration=True,
...                                            transform="standardize",
...                                            seed=12345)
>>> pros_glrm.train(x=prostate.names, training_frame=prostate)
>>> pros_glrm.show()
property user_x

User-specified initial X

Type: H2OFrame.

Examples

>>> arrestsH2O = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> initial_x = ([[5.412, 65.24, -7.54, -0.032, 2.212, 92.24, -17.54, 23.268, 0.312,
...                123.24, 14.46, 9.768, 1.012, 19.24, -15.54, -1.732, 5.412, 65.24,
...                -7.54, -0.032, 2.212, 92.24, -17.54, 23.268, 0.312, 123.24, 14.46,
...                9.76, 1.012, 19.24, -15.54, -1.732, 5.412, 65.24, -7.54, -0.032,
...                2.212, 92.24, -17.54, 23.268, 0.312, 123.24, 14.46, 9.768, 1.012,
...                19.24, -15.54, -1.732, 5.412, 65.24]]*4)
>>> initial_x_h2o = h2o.H2OFrame(list(zip(*initial_x)))
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=4,
...                                               transform="demean",
...                                               loss="quadratic",
...                                               gamma_x=0.5,
...                                               gamma_y=0.3,
...                                               init="user",
...                                               user_x=initial_x_h2o,
...                                               recover_svd=True)
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property user_y

User-specified initial Y

Type: H2OFrame.

Examples

>>> arrestsH2O = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> initial_y = [[5.412,  65.24,  -7.54, -0.032],
...              [2.212,  92.24, -17.54, 23.268],
...              [0.312, 123.24,  14.46,  9.768],
...              [1.012,  19.24, -15.54, -1.732]]
>>> initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y)))
>>> arrests_glrm = H2OGeneralizedLowRankEstimator(k=4,
...                                               transform="demean",
...                                               loss="quadratic",
...                                               gamma_x=0.5,
...                                               gamma_y=0.3,
...                                               init="user",
...                                               user_y=initial_y_h2o,
...                                               recover_svd=True)
>>> arrests_glrm.train(x=arrestsH2O.names, training_frame=arrestsH2O)
>>> arrests_glrm.show()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> iris = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")
>>> iris_glrm = H2OGeneralizedLowRankEstimator(k=3,
...                                            loss="quadratic",
...                                            gamma_x=0.5,
...                                            gamma_y=0.5,
...                                            transform="standardize")
>>> iris_glrm.train(x=iris.names,
...                 training_frame=iris,
...                 validation_frame=iris)
>>> iris_glrm.show()

H2OIsolationForestEstimator

class h2o.estimators.isolation_forest.H2OIsolationForestEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Isolation Forest

Builds an Isolation Forest model. The Isolation Forest algorithm samples the training frame and, in each iteration, builds a tree that partitions the space of the sampled observations until it isolates each observation. The length of the path from the root to a leaf node of the resulting tree is used to calculate the anomaly score. Anomalies are easier to isolate, so their average tree path is expected to be shorter than the paths of regular observations.

property build_tree_one_node

Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(build_tree_one_node=True,
...                                       seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property categorical_encoding

Encoding scheme for categorical features

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> encoding = "one_hot_explicit"
>>> airlines_if = H2OIsolationForestEstimator(categorical_encoding=encoding,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property col_sample_rate_change_per_level

Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> airlines_if = H2OIsolationForestEstimator(col_sample_rate_change_per_level=.9,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property col_sample_rate_per_tree

Column sample rate per tree (from 0.0 to 1.0)

Type: float (default: 1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> airlines_if = H2OIsolationForestEstimator(col_sample_rate_per_tree=.7,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property contamination

Contamination ratio - the proportion of anomalies in the input dataset. If undefined (-1) the predict function will not mark observations as anomalies and only anomaly score will be returned. Defaults to -1 (undefined).

Type: float (default: -1).

property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_if = H2OIsolationForestEstimator(max_depth=3,
...                                      seed=1234,
...                                      export_checkpoints_dir=checkpoints_dir)
>>> air_if.train(x=predictors,
...              training_frame=airlines)
>>> len(listdir(checkpoints_dir))
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_if = H2OIsolationForestEstimator(seed=1234,
...                                       ignore_const_cols=True)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property max_depth

Maximum tree depth (0 for unlimited).

Type: int (default: 8).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(max_depth=2,
...                                       seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(max_runtime_secs=10,
...                                       ntrees=10000,
...                                       max_depth=10,
...                                       seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property min_rows

Fewest allowed (weighted) observations in a leaf.

Type: float (default: 1).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(min_rows=16,
...                                       seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property mtries

Number of variables randomly sampled as candidates at each split. If set to -1, defaults to (number of predictors)/3.

Type: int (default: -1).

Examples

>>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
>>> predictors = covtype.columns[0:54]
>>> cov_if = H2OIsolationForestEstimator(mtries=30, seed=1234)
>>> cov_if.train(x=predictors,
...              training_frame=covtype)
>>> cov_if.model_performance()
property ntrees

Number of trees.

Type: int (default: 50).

Examples

>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> predictors = titanic.columns
>>> tree_num = [20, 50, 80, 110, 140, 170, 200]
>>> label = ["20", "50", "80", "110", "140", "170", "200"]
>>> for key, num in enumerate(tree_num):
...     titanic_if = H2OIsolationForestEstimator(ntrees=num,
...                                              seed=1234)
...     titanic_if.train(x=predictors,
...                      training_frame=titanic) 
...     print(label[key], 'training score', titanic_if.mse(train=True))
property sample_rate

Rate of randomly sampled observations used to train each Isolation Forest tree. Needs to be in range from 0.0 to 1.0. If set to -1, sample_rate is disabled and sample_size will be used instead.

Type: float (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> airlines_if = H2OIsolationForestEstimator(sample_rate=.7,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property sample_size

Number of randomly sampled observations used to train each Isolation Forest tree. Only one of parameters sample_size and sample_rate should be defined. If sample_rate is defined, sample_size will be ignored.

Type: int (default: 256).

Examples

>>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_train.csv")
>>> test = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_test.csv")
>>> isofor_model = H2OIsolationForestEstimator(sample_size=5,
...                                            ntrees=7)
>>> isofor_model.train(training_frame=train)
>>> isofor_model.model_performance()
>>> isofor_model.model_performance(test)
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(score_each_iteration=True,
...                                       ntrees=55,
...                                       seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property score_tree_interval

Score the model after every so many trees. Disabled if set to 0.

Type: int (default: 0).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(score_tree_interval=5,
...                                       seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property seed

Seed for pseudo random number generator (if applicable)

Type: int (default: -1).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> isofor_w_seed = H2OIsolationForestEstimator(seed=1234) 
>>> isofor_w_seed.train(x=predictors,
...                     training_frame=airlines)
>>> isofor_wo_seed = H2OIsolationForestEstimator()
>>> isofor_wo_seed.train(x=predictors,
...                      training_frame=airlines)
>>> isofor_w_seed.model_performance()
>>> isofor_wo_seed.model_performance()
property stopping_metric

Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client.

One of: "auto", "anomaly_score", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "misclassification", "mean_per_class_error" (default: "auto").

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> airlines_if = H2OIsolationForestEstimator(stopping_metric="auto",
...                                           stopping_rounds=3,
...                                           stopping_tolerance=1e-2,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property stopping_rounds

Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)

Type: int (default: 0).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> airlines_if = H2OIsolationForestEstimator(stopping_metric="auto",
...                                           stopping_rounds=3,
...                                           stopping_tolerance=1e-2,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property stopping_tolerance

Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)

Type: float (default: 0.01).

Examples

>>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
>>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
...               "DayOfWeek", "Month", "Distance", "FlightNum"]
>>> airlines_if = H2OIsolationForestEstimator(stopping_metric="auto",
...                                           stopping_rounds=3,
...                                           stopping_tolerance=1e-2,
...                                           seed=1234)
>>> airlines_if.train(x=predictors,
...                   training_frame=airlines)
>>> airlines_if.model_performance()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars_if = H2OIsolationForestEstimator(seed=1234)
>>> cars_if.train(x=predictors,
...               training_frame=cars)
>>> cars_if.model_performance()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

property validation_response_column

(experimental) Name of the response column in the validation frame. Response column should be binary and indicate not anomaly/anomaly.

Type: str.

H2OKMeansEstimator

class h2o.estimators.kmeans.H2OKMeansEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

K-means

Performs k-means clustering on an H2O dataset.

property categorical_encoding

Encoding scheme for categorical features

One of: "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited" (default: "auto").

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> encoding = "one_hot_explicit"
>>> pros_km = H2OKMeansEstimator(categorical_encoding=encoding,
...                              seed=1234)
>>> pros_km.train(x=predictors,
...               training_frame=train,
...               validation_frame=valid)
>>> pros_km.scoring_history()
property cluster_size_constraints

An array specifying the minimum number of points that should be in each cluster. The length of the constraints array has to be the same as the number of clusters.

Type: List[int].

Examples

>>> iris_h2o = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> k=3
>>> start_points = h2o.H2OFrame(
...         [[4.9, 3.0, 1.4, 0.2],
...          [5.6, 2.5, 3.9, 1.1],
...          [6.5, 3.0, 5.2, 2.0]])
>>> kmm = H2OKMeansEstimator(k=k,
...                          user_points=start_points,
...                          standardize=True,
...                          cluster_size_constraints=[2, 5, 8],
...                          score_each_iteration=True)
>>> kmm.train(x=list(range(7)), training_frame=iris_h2o)
>>> kmm.scoring_history()
property estimate_k

Whether to estimate the number of clusters (<=k) iteratively and deterministically.

Type: bool (default: False).

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris['class'] = iris['class'].asfactor()
>>> predictors = iris.columns[:-1]
>>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
>>> iris_kmeans = H2OKMeansEstimator(k=10,
...                                  estimate_k=True,
...                                  standardize=False,
...                                  seed=1234)
>>> iris_kmeans.train(x=predictors,
...                   training_frame=train,
...                   validation_frame=valid)
>>> iris_kmeans.scoring_history()
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_km = H2OKMeansEstimator(export_checkpoints_dir=checkpoints_dir,
...                             seed=1234)
>>> air_km.train(x=predictors, training_frame=airlines)
>>> len(listdir(checkpoints_dir))
property fold_assignment

Cross-validation fold assignment scheme, if fold_column is not specified. The ‘Stratified’ option will stratify the folds based on the response variable, for classification problems.

One of: "auto", "random", "modulo", "stratified" (default: "auto").

Examples

>>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
>>> predictors = ["radiation","temperature","wind"]
>>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
>>> ozone_km = H2OKMeansEstimator(fold_assignment="Random",
...                               nfolds=5,
...                               seed=1234)
>>> ozone_km.train(x=predictors,
...                training_frame=train,
...                validation_frame=valid)
>>> ozone_km.scoring_history()
property fold_column

Column with cross-validation fold index assignment per observation.

Type: str.

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> print(cars['fold_numbers'])
>>> cars_km = H2OKMeansEstimator(seed=1234)
>>> cars_km.train(x=predictors,
...               training_frame=cars,
...               fold_column="fold_numbers")
>>> cars_km.scoring_history()
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_km = H2OKMeansEstimator(ignore_const_cols=True,
...                              seed=1234)
>>> cars_km.train(x=predictors,
...               training_frame=train,
...               validation_frame=valid)
>>> cars_km.scoring_history()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property init

Initialization mode

One of: "random", "plus_plus", "furthest", "user" (default: "furthest").

Examples

>>> seeds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/seeds_dataset.txt")
>>> predictors = seeds.columns[0:7]
>>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
>>> seeds_km = H2OKMeansEstimator(k=3,
...                               init='Furthest',
...                               seed=1234)
>>> seeds_km.train(x=predictors,
...                training_frame=train,
...                validation_frame= valid)
>>> seeds_km.scoring_history()
property k

The max. number of clusters. If estimate_k is disabled, the model will find k centroids, otherwise it will find up to k centroids.

Type: int (default: 1).

Examples

>>> seeds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/seeds_dataset.txt")
>>> predictors = seeds.columns[0:7]
>>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
>>> seeds_km = H2OKMeansEstimator(k=3, seed=1234)
>>> seeds_km.train(x=predictors,
...                training_frame=train,
...                validation_frame=valid)
>>> seeds_km.scoring_history()
property keep_cross_validation_fold_assignment

Whether to keep the cross-validation fold assignment.

Type: bool (default: False).

Examples

>>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
>>> predictors = ["radiation","temperature","wind"]
>>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
>>> ozone_km = H2OKMeansEstimator(keep_cross_validation_fold_assignment=True,
...                               nfolds=5,
...                               seed=1234)
>>> ozone_km.train(x=predictors,
...                training_frame=train)
>>> ozone_km.scoring_history()
property keep_cross_validation_models

Whether to keep the cross-validation models.

Type: bool (default: True).

Examples

>>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
>>> predictors = ["radiation","temperature","wind"]
>>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
>>> ozone_km = H2OKMeansEstimator(keep_cross_validation_models=True,
...                               nfolds=5,
...                               seed=1234)
>>> ozone_km.train(x=predictors,
...                training_frame=train,
...                validation_frame=valid)
>>> ozone_km.scoring_history()
property keep_cross_validation_predictions

Whether to keep the predictions of the cross-validation models.

Type: bool (default: False).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
...               "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(keep_cross_validation_predictions=True,
...                              nfolds=5,
...                              seed=1234)
>>> pros_km.train(x=predictors,
...               training_frame=train,
...               validation_frame=valid)
>>> pros_km.scoring_history()
property max_iterations

Maximum training iterations (if estimate_k is enabled, then this is for each inner Lloyd's iteration)

Type: int (default: 10).

Examples

>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
...               "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(max_iterations=50)
>>> benign_km.train(x=predictors,
...                 training_frame=train,
...                 validation_frame=valid)
>>> benign_km.scoring_history()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
...               "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(max_runtime_secs=10,
...                                seed=1234)
>>> benign_km.train(x=predictors,
...                 training_frame=train,
...                 validation_frame=valid)
>>> benign_km.scoring_history()
property nfolds

Number of folds for K-fold cross-validation (0 to disable or >= 2).

Type: int (default: 0).

Examples

>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
...               "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(nfolds=5, seed=1234)
>>> benign_km.train(x=predictors,
...                 training_frame=train,
...                 validation_frame=valid)
>>> benign_km.scoring_history()
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
...               "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(score_each_iteration=True,
...                                seed=1234)
>>> benign_km.train(x=predictors,
...                 training_frame=train,
...                 validation_frame=valid)
>>> benign_km.scoring_history()
property seed

RNG Seed

Type: int (default: -1).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_w_seed = H2OKMeansEstimator(seed=1234)
>>> pros_w_seed.train(x=predictors,
...                   training_frame=train,
...                   validation_frame=valid)
>>> pros_wo_seed = H2OKMeansEstimator()
>>> pros_wo_seed.train(x=predictors,
...                    training_frame=train,
...                    validation_frame=valid)
>>> pros_w_seed.scoring_history()
>>> pros_wo_seed.scoring_history()
property standardize

Standardize columns before computing distances

Type: bool (default: True).

Examples

>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_km = H2OKMeansEstimator(standardize=True)
>>> boston_km.train(x=predictors,
...                 training_frame=train,
...                 validation_frame=valid)
>>> boston_km.scoring_history()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
...               "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(seed=1234)
>>> pros_km.train(x=predictors,
...               training_frame=train,
...               validation_frame=valid)
>>> pros_km.scoring_history()
property user_points

This option allows you to specify a dataframe, where each row represents an initial cluster center. The user-specified points must have the same number of columns as the training observations. The number of rows must equal the number of clusters.

Type: H2OFrame.

Examples

>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris['class'] = iris['class'].asfactor()
>>> predictors = iris.columns[:-1]
>>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
>>> point1 = [4.9,3.0,1.4,0.2]
>>> point2 = [5.6,2.5,3.9,1.1]
>>> point3 = [6.5,3.0,5.2,2.0]
>>> points = h2o.H2OFrame([point1, point2, point3])
>>> iris_km = H2OKMeansEstimator(k=3,
...                              user_points=points,
...                              seed=1234)
>>> iris_km.train(x=predictors,
...               training_frame=iris,
...               validation_frame=valid)
>>> iris_km.tot_withinss(valid=True)
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
...               "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(seed=1234)
>>> pros_km.train(x=predictors,
...               training_frame=train,
...               validation_frame=valid)
>>> pros_km.scoring_history()

H2OPrincipalComponentAnalysisEstimator

class h2o.estimators.pca.H2OPrincipalComponentAnalysisEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Principal Components Analysis

property compute_metrics

Whether to compute metrics on the training data

Type: bool (default: True).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
>>> prostate['RACE'] = prostate['RACE'].asfactor()
>>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
>>> prostate['DPROS'] = prostate['DPROS'].asfactor()
>>> pros_pca = H2OPrincipalComponentAnalysisEstimator(compute_metrics=False)
>>> pros_pca.train(x=prostate.names, training_frame=prostate)
>>> pros_pca.show()
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
>>> prostate['RACE'] = prostate['RACE'].asfactor()
>>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
>>> prostate['DPROS'] = prostate['DPROS'].asfactor()
>>> checkpoints_dir = tempfile.mkdtemp()
>>> pros_pca = H2OPrincipalComponentAnalysisEstimator(impute_missing=True,
...                                                   export_checkpoints_dir=checkpoints_dir)
>>> pros_pca.train(x=prostate.names, training_frame=prostate)
>>> len(listdir(checkpoints_dir))
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
>>> prostate['RACE'] = prostate['RACE'].asfactor()
>>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
>>> prostate['DPROS'] = prostate['DPROS'].asfactor()
>>> pros_pca = H2OPrincipalComponentAnalysisEstimator(ignore_const_cols=False)
>>> pros_pca.train(x=prostate.names, training_frame=prostate)
>>> pros_pca.show()
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

property impute_missing

Whether to impute missing entries with the column mean

Type: bool (default: False).

Examples

>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
>>> prostate['RACE'] = prostate['RACE'].asfactor()
>>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
>>> prostate['DPROS'] = prostate['DPROS'].asfactor()
>>> pros_pca = H2OPrincipalComponentAnalysisEstimator(impute_missing=True)
>>> pros_pca.train(x=prostate.names, training_frame=prostate)
>>> pros_pca.show()
init_for_pipeline()[source]

Returns H2OPCA object which implements fit and transform method to be used in sklearn.Pipeline properly. All parameters defined in self.__params, should be input parameters in H2OPCA.__init__ method.

Returns

H2OPCA object

Examples

>>> from sklearn.pipeline import Pipeline
>>> from h2o.transforms.preprocessing import H2OScaler
>>> from h2o.estimators import H2ORandomForestEstimator
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> pipe = Pipeline([("standardize", H2OScaler()),
...                  ("pca", H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
...                  ("rf", H2ORandomForestEstimator(seed=42,ntrees=5))])
>>> pipe.fit(iris[:4], iris[4])
property k

Rank of matrix approximation

Type: int (default: 1).

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
...                                                   transform="standardize",
...                                                   pca_method="power",
...                                                   impute_missing=True,
...                                                   max_iterations=800)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property max_iterations

Maximum training iterations

Type: int (default: 1000).

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
...                                                   transform="standardize",
...                                                   pca_method="power",
...                                                   impute_missing=True,
...                                                   max_iterations=800)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
...                                                   transform="standardize",
...                                                   pca_method="power",
...                                                   impute_missing=True,
...                                                   max_iterations=800,
...                                                   max_runtime_secs=15)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property pca_impl

Specify the implementation to use for computing PCA (via SVD or EVD): MTJ_EVD_DENSEMATRIX - eigenvalue decompositions for dense matrix using MTJ; MTJ_EVD_SYMMMATRIX - eigenvalue decompositions for symmetric matrix using MTJ; MTJ_SVD_DENSEMATRIX - singular-value decompositions for dense matrix using MTJ; JAMA - eigenvalue decompositions for dense matrix using JAMA. References: JAMA - http://math.nist.gov/javanumerics/jama/; MTJ - https://github.com/fommil/matrix-toolkits-java/

One of: "mtj_evd_densematrix", "mtj_evd_symmmatrix", "mtj_svd_densematrix", "jama".

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
...                                                   pca_impl="jama",
...                                                   impute_missing=True,
...                                                   max_iterations=1200)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property pca_method

Specify the algorithm to use for computing the principal components: GramSVD - uses a distributed computation of the Gram matrix, followed by a local SVD; Power - computes the SVD using the power iteration method (experimental); Randomized - uses randomized subspace iteration method; GLRM - fits a generalized low-rank model with L2 loss function and no regularization and solves for the SVD using local matrix algebra (experimental)

One of: "gram_s_v_d", "power", "randomized", "glrm" (default: "gram_s_v_d").

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
...                                                   transform="standardize",
...                                                   pca_method="power",
...                                                   impute_missing=True,
...                                                   max_iterations=800)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
...                                                   score_each_iteration=True,
...                                                   seed=1234,
...                                                   impute_missing=True)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property seed

RNG seed for initialization

Type: int (default: -1).

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
...                                                   seed=1234,
...                                                   impute_missing=True)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator()
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property transform

Transformation of training data

One of: "none", "standardize", "normalize", "demean", "descale" (default: "none").

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
...                                                   transform="standardize",
...                                                   pca_method="power",
...                                                   impute_missing=True,
...                                                   max_iterations=800)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property use_all_factor_levels

Whether first factor level is included in each categorical expansion

Type: bool (default: False).

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
...                                                   use_all_factor_levels=True,
...                                                   seed=1234)
>>> data_pca.train(x=data.names, training_frame=data)
>>> data_pca.show()
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
>>> train, valid = data.split_frame(ratios=[.8], seed=1234)
>>> model_pca = H2OPrincipalComponentAnalysisEstimator(impute_missing=True)
>>> model_pca.train(x=data.names,
...                training_frame=train,
...                validation_frame=valid)
>>> model_pca.show()

Miscellaneous

automl

h2o.automl.get_automl(project_name)[source]

Retrieve information about an AutoML instance.

Parameters

project_name (str) – A string indicating the project_name of the automl instance to retrieve.

Returns

A dictionary containing the project_name, leader model, leaderboard, event_log.

h2o.automl.get_leaderboard(aml, extra_columns=None)[source]

Retrieve the leaderboard from the AutoML instance. Contrary to the default leaderboard attached to the automl instance, this one can return columns other than the metrics.

Parameters
  • aml (H2OAutoML) – The instance for which to return the leaderboard.

  • extra_columns – A string or a list of strings specifying which optional columns should be added to the leaderboard. Defaults to None.

Currently supported extensions are: - ‘ALL’: adds all columns below. - ‘training_time_ms’: column providing the training time of each model in milliseconds (doesn’t include the training of cross validation models). - ‘predict_time_per_row_ms’: column providing the average prediction time by the model for a single row.

Returns

An H2OFrame representing the leaderboard.

Examples

>>> aml = H2OAutoML(max_runtime_secs=30)
>>> aml.train(y=y, training_frame=train)
>>> lb_all = h2o.automl.get_leaderboard(aml, 'ALL')
>>> lb_custom = h2o.automl.get_leaderboard(aml, ['predict_time_per_row_ms', 'training_time_ms'])
>>> lb_custom_sorted = lb_custom.sort(by='predict_time_per_row_ms')

H2OAutoML

class h2o.automl.H2OAutoML(nfolds=5, balance_classes=False, class_sampling_factors=None, max_after_balance_size=5.0, max_runtime_secs=None, max_runtime_secs_per_model=None, max_models=None, stopping_metric='AUTO', stopping_tolerance=None, stopping_rounds=3, seed=None, project_name=None, exclude_algos=None, include_algos=None, exploitation_ratio=0, modeling_plan=None, monotone_constraints=None, algo_parameters=None, keep_cross_validation_predictions=False, keep_cross_validation_models=False, keep_cross_validation_fold_assignment=False, sort_metric='AUTO', export_checkpoints_dir=None, verbosity='warn')[source]

Bases: h2o.automl._base.H2OAutoMLBaseMixin, h2o.base.Keyed

Automatic Machine Learning

The Automatic Machine Learning (AutoML) function automates the supervised machine learning model training process. The current version of AutoML trains and cross-validates a Random Forest (DRF), an Extremely-Randomized Forest (DRF/XRT), a random grid of Generalized Linear Models (GLM), a random grid of XGBoost (XGBoost), a random grid of Gradient Boosting Machines (GBM), a random grid of Deep Neural Nets (DeepLearning), and 2 Stacked Ensembles, one of all the models, and one of only the best models of each kind.

Examples

>>> import h2o
>>> from h2o.automl import H2OAutoML
>>> h2o.init()
>>> # Import a sample binary outcome train/test set into H2O
>>> train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
>>> test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
>>> # Identify the response and set of predictors
>>> y = "response"
>>> x = list(train.columns)  #if x is defined as all columns except the response, then x is not required
>>> x.remove(y)
>>> # For binary classification, response should be a factor
>>> train[y] = train[y].asfactor()
>>> test[y] = test[y].asfactor()
>>> # Run AutoML for 30 seconds
>>> aml = H2OAutoML(max_runtime_secs = 30)
>>> aml.train(x = x, y = y, training_frame = train)
>>> # Print Leaderboard (ranked by xval metrics)
>>> aml.leaderboard
>>> # (Optional) Evaluate performance on a test set
>>> perf = aml.leader.model_performance(test)
>>> perf.auc()
detach()[source]

Detach the Python object from the backend, usually by clearing its key

property event_log

Retrieve the backend event log from an H2OAutoML object

Returns

an H2OFrame with detailed events that occurred during the AutoML training.

property key
Returns

the unique key representing the object on the backend

property leader

Retrieve the top model from an H2OAutoML object

Returns

an H2O model

Examples

>>> # Set up an H2OAutoML object
>>> aml = H2OAutoML(max_runtime_secs=30)
>>> # Launch an AutoML run
>>> aml.train(y=y, training_frame=train)
>>> # Get the best model in the AutoML Leaderboard
>>> aml.leader
>>>
>>> # Get AutoML object by `project_name`
>>> get_aml = h2o.automl.get_automl(aml.project_name)
>>> # Get the best model in the AutoML Leaderboard
>>> get_aml.leader
property leaderboard

Retrieve the leaderboard from an H2OAutoML object

Returns

an H2OFrame with model ids in the first column and evaluation metric in the second column sorted by the evaluation metric

Examples

>>> # Set up an H2OAutoML object
>>> aml = H2OAutoML(max_runtime_secs=30)
>>> # Launch an AutoML run
>>> aml.train(y=y, training_frame=train)
>>> # Get the AutoML Leaderboard
>>> aml.leaderboard
>>>
>>> # Get AutoML object by `project_name`
>>> get_aml = h2o.automl.get_automl(aml.project_name)
>>> # Get the AutoML Leaderboard
>>> get_aml.leaderboard
property modeling_steps

Expose the modeling steps effectively used by the AutoML run. This executed plan can be directly reinjected as the modeling_plan property of a new AutoML instance to improve reproducibility across AutoML versions.

Returns

a list of dictionaries representing the effective modeling plan.

predict(test_data)[source]

Predict on a dataset.

Parameters

test_data (H2OFrame) – Data on which to make predictions.

Returns

A new H2OFrame of predictions.

Examples

>>> # Set up an H2OAutoML object
>>> aml = H2OAutoML(max_runtime_secs=30)
>>> # Launch an H2OAutoML run
>>> aml.train(y=y, training_frame=train)
>>> # Predict with top model from AutoML Leaderboard on a H2OFrame called 'test'
>>> aml.predict(test)
>>>
>>> # Get AutoML object by `project_name`
>>> get_aml = h2o.automl.get_automl(aml.project_name)
>>> # Predict with top model from AutoML Leaderboard on a H2OFrame called 'test'
>>> get_aml.predict(test)
property project_name

Retrieve a string indicating the project_name of the automl instance.

Returns

a string containing the project_name

train(x=None, y=None, training_frame=None, fold_column=None, weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None)[source]

Begins an AutoML task, a background task that automatically builds a number of models with various algorithms and tracks their performance in a leaderboard. At any point in the process you may use H2O’s performance or prediction functions on the resulting models.

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • fold_column – The name or index of the column in training_frame that holds per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds per-row weights.

  • training_frame – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold_column or weights_column).

  • validation_frame – H2OFrame with validation data. This argument is ignored unless the user sets nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used for early stopping of individual models and early stopping of the grid searches. By default and when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.

  • leaderboard_frame – H2OFrame with test data for scoring the leaderboard. This is optional and if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard rankings instead.

  • blending_frame – H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values). This is optional, but when provided, it is also recommended to disable cross validation by setting nfolds=0 and to provide a leaderboard frame for scoring purposes.

Returns

An H2OAutoML object.

Examples

>>> # Set up an H2OAutoML object
>>> aml = H2OAutoML(max_runtime_secs=30)
>>> # Launch an AutoML run
>>> aml.train(y=y, training_frame=train)
property training_info

Expose the name/value columns of event_log as a simple dictionary, for example start_epoch, stop_epoch, … See event_log() to obtain a description of those key/value pairs.

Returns

a dictionary with event_log[‘name’] column as keys and event_log[‘value’] column as values.

H2OEstimator

class h2o.estimators.estimator_base.H2OEstimator(*args, **kwargs)[source]

Bases: h2o.model.model_base.ModelBase

Base class for H2O Estimators.

H2O Estimators implement the following methods for model construction:

  • start() - Top-level user-facing API for asynchronous model build

  • join() - Top-level user-facing API for blocking on async model build

  • train() - Top-level user-facing API for model building.

  • fit() - Used by scikit-learn.

Because H2OEstimator instances are instances of ModelBase, these objects can use the H2O model API.

convert_H2OXGBoostParams_2_XGBoostParams()[source]

In order to use convert_H2OXGBoostParams_2_XGBoostParams and convert_H2OFrame_2_DMatrix, you must import the following toolboxes: xgboost, pandas, numpy and scipy.sparse.

Given an H2OXGBoost model, this method will generate the corresponding parameters that should be used by native XGBoost in order to give exactly the same result, assuming that the same dataset (derived from h2oFrame) is used to train the native XGBoost model.

Follow the steps below to compare H2OXGBoost and native XGBoost:

  1. Train the H2OXGBoost model with H2OFrame trainFile and generate a prediction:

  • h2oModelD = H2OXGBoostEstimator(**h2oParamsD) # parameters specified as a dict()

  • h2oModelD.train(x=myX, y=y, training_frame=trainFile) # train with H2OFrame trainFile

  • h2oPredict = h2oPredictD = h2oModelD.predict(trainFile)

  2. Derive the DMatrix from H2OFrame:

  • nativeDMatrix = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)

  3. Derive the parameters for native XGBoost:

  • nativeParams = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()

  4. Train your native XGBoost model and generate a prediction:

  • nativeModel = xgb.train(params=nativeParams[0], dtrain=nativeDMatrix, num_boost_round=nativeParams[1])

  • nativePredict = nativeModel.predict(data=nativeDMatrix, ntree_limit=nativeParams[1])

  5. Compare the predictions h2oPredict from H2OXGBoost, nativePredict from native XGBoost.

Returns

nativeParams, num_boost_round

fit(X, y=None, **params)[source]

Fit an H2O model as part of a scikit-learn pipeline or grid search.

A warning will be issued if a caller other than sklearn attempts to use this method.

Parameters
  • X (H2OFrame) – An H2OFrame consisting of the predictor variables.

  • y (H2OFrame) – An H2OFrame consisting of the response variable.

  • params – Extra arguments.

Returns

The current instance of H2OEstimator for method chaining.

get_params(deep=True)[source]

Obtain parameters for this estimator.

Used primarily for sklearn Pipelines and sklearn grid search.

Parameters

deep – If True, return parameters of all sub-objects that are estimators.

Returns

A dict of parameters

join()[source]

Wait until job’s completion.

set_params(**parms)[source]

Used by sklearn for updating parameters during grid search.

Parameters

parms – A dictionary of parameters that will be set on this model.

Returns

self, the current estimator object with the parameters all set as desired.

start(x, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, **params)[source]

Train the model asynchronously (to block for results call join()).

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • training_frame (H2OFrame) – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold, offset, and weights).

  • offset_column – The name or index of the column in training_frame that holds the offsets.

  • fold_column – The name or index of the column in training_frame that holds the per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds the per-row weights.

  • validation_frame – H2OFrame with validation data to be scored on while training.

train(x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None, model_id=None, verbose=False)[source]

Train the H2O model.

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • training_frame (H2OFrame) – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold, offset, and weights).

  • offset_column – The name or index of the column in training_frame that holds the offsets.

  • fold_column – The name or index of the column in training_frame that holds the per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds the per-row weights.

  • validation_frame – H2OFrame with validation data to be scored on while training.

  • max_runtime_secs (float) – Maximum allowed runtime in seconds for model training. Use 0 to disable.

  • verbose (bool) – Print scoring history to stdout. Defaults to False.

train_segments(x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None, segments=None, segment_models_id=None, parallelism=1, verbose=False)[source]

Trains H2O model for each segment (subpopulation) of the training dataset.

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • training_frame (H2OFrame) – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold, offset, and weights).

  • offset_column – The name or index of the column in training_frame that holds the offsets.

  • fold_column – The name or index of the column in training_frame that holds the per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds the per-row weights.

  • validation_frame – H2OFrame with validation data to be scored on while training.

  • max_runtime_secs (float) – Maximum allowed runtime in seconds for each model training. Use 0 to disable. Please note that regardless of how this parameter is set, a model will be built for each input segment. This parameter only affects individual model training.

  • segments – A list of columns to segment-by. H2O will group the training (and validation) dataset by the segment-by columns and train a separate model for each segment (group of rows). As an alternative to providing a list of columns, users can also supply an explicit enumeration of segments to build the models for. This enumeration needs to be represented as H2OFrame.

  • segment_models_id – Identifier for the returned collection of Segment Models. If not specified it will be automatically generated.

  • parallelism – Level of parallelism of the bulk segment models building, it is the maximum number of models each H2O node will be building in parallel.

  • verbose (bool) – Enable to print additional information during model building. Defaults to False.

Examples

>>> response = "survived"
>>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
>>> titanic[response] = titanic[response].asfactor()
>>> predictors = ["survived","name","sex","age","sibsp","parch","ticket","fare","cabin"]
>>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
>>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
...                                             x=predictors,
...                                             y=response,
...                                             training_frame=train,
...                                             validation_frame=valid)
>>> titanic_models.as_frame()

H2OSingularValueDecompositionEstimator

class h2o.estimators.svd.H2OSingularValueDecompositionEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Singular Value Decomposition

property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> checkpoints_dir = tempfile.mkdtemp()
>>> fit_h2o = H2OSingularValueDecompositionEstimator(export_checkpoints_dir=checkpoints_dir,
...                                                  seed=-5)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> len(listdir(checkpoints_dir))
property ignore_const_cols

Ignore constant columns.

Type: bool (default: True).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(ignore_const_cols=False,
...                                                  nv=4)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property ignored_columns

Names of columns to ignore for training.

Type: List[str].

init_for_pipeline()[source]

Returns an H2OSVD object which implements the fit and transform methods so that it can be used properly in sklearn.Pipeline. All parameters defined in self.__params should be input parameters of the H2OSVD.__init__ method.

Returns

H2OSVD object

Examples

>>> from h2o.transforms.preprocessing import H2OScaler
>>> from h2o.estimators import H2ORandomForestEstimator
>>> from h2o.estimators import H2OSingularValueDecompositionEstimator
>>> from sklearn.pipeline import Pipeline
>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> pipe = Pipeline([("standardize", H2OScaler()),
...                  ("svd", H2OSingularValueDecompositionEstimator(nv=3).init_for_pipeline()),
...                  ("rf", H2ORandomForestEstimator(seed=42,ntrees=50))])
>>> pipe.fit(arrests[1:], arrests[0])
property keep_u

Save left singular vectors?

Type: bool (default: True).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(keep_u=False)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property max_iterations

Maximum iterations

Type: int (default: 1000).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(nv=4,
...                                                  transform="standardize",
...                                                  max_iterations=2000)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(nv=4,
...                                                  transform="standardize",
...                                                  max_runtime_secs=25)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property nv

Number of right singular vectors

Type: int (default: 1).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(nv=4,
...                                                  transform="standardize",
...                                                  max_iterations=2000)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property score_each_iteration

Whether to score during each iteration of model training.

Type: bool (default: False).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(nv=4,
...                                                  score_each_iteration=True)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property seed

RNG seed for k-means++ initialization

Type: int (default: -1).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(nv=4, seed=-3)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property svd_method

Method for computing SVD (Caution: Randomized is currently experimental and unstable)

One of: "gram_s_v_d", "power", "randomized" (default: "gram_s_v_d").

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(svd_method="power")
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator()
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property transform

Transformation of training data

One of: "none", "standardize", "normalize", "demean", "descale" (default: "none").

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(nv=4,
...                                                  transform="standardize",
...                                                  max_iterations=2000)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property u_name

Frame key to save left singular vectors

Type: str.

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(u_name="fit_h2o")
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o.u_name
>>> fit_h2o
property use_all_factor_levels

Whether first factor level is included in each categorical expansion

Type: bool (default: True).

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> fit_h2o = H2OSingularValueDecompositionEstimator(use_all_factor_levels=False)
>>> fit_h2o.train(x=list(range(4)), training_frame=arrests)
>>> fit_h2o
property validation_frame

Id of the validation data frame.

Type: H2OFrame.

Examples

>>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
>>> train, valid = arrests.split_frame(ratios=[.8])
>>> fit_h2o = H2OSingularValueDecompositionEstimator()
>>> fit_h2o.train(x=list(range(4)),
...               training_frame=train,
...               validation_frame=valid)
>>> fit_h2o

H2OWord2vecEstimator

class h2o.estimators.word2vec.H2OWord2vecEstimator(**kwargs)[source]

Bases: h2o.estimators.estimator_base.H2OEstimator

Word2Vec

property epochs

Number of training iterations to run

Type: int (default: 5).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("teacher", count = 5)
>>> print(synonyms)
>>>
>>> w2v_model2 = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 1)
>>> w2v_model2.train(training_frame=words)
>>> synonyms2 = w2v_model2.find_synonyms("teacher", 3)
>>> print(synonyms2)
property export_checkpoints_dir

Automatically export generated models to this directory.

Type: str.

Examples

>>> import tempfile
>>> from os import listdir
>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> checkpoints_dir = tempfile.mkdtemp()
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=1,
...                                  max_runtime_secs=10,
...                                  export_checkpoints_dir=checkpoints_dir)
>>> w2v_model.train(training_frame=words)
>>> len(listdir(checkpoints_dir))
static from_external(external=<class 'h2o.frame.H2OFrame'>)[source]

Creates new H2OWord2vecEstimator based on an external model.

Parameters

external – H2OFrame with an external model

Returns

H2OWord2vecEstimator instance representing the external model

Examples

>>> words = h2o.create_frame(rows=10, cols=1,
...                          string_fraction=1.0,
...                          missing_fraction=0.0)
>>> embeddings = h2o.create_frame(rows=10, cols=100,
...                               real_fraction=1.0,
...                               missing_fraction=0.0)
>>> word_embeddings = words.cbind(embeddings)
>>> w2v_model = H2OWord2vecEstimator.from_external(external=word_embeddings)
property init_learning_rate

Set the starting learning rate

Type: float (default: 0.025).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=3, init_learning_rate=0.05)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("assistant", 3)
>>> print(synonyms)
property max_runtime_secs

Maximum allowed runtime in seconds for model training. Use 0 to disable.

Type: float (default: 0).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=1, max_runtime_secs=10)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("tutor", 3)
>>> print(synonyms)
property min_word_freq

This will discard words that appear fewer than <int> times

Type: int (default: 5).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=1, min_word_freq=4)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("teacher", 3)
>>> print(synonyms)
property norm_model

Use Hierarchical Softmax

One of: "hsm" (default: "hsm").

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=1, norm_model="hsm")
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("teacher", 3)
>>> print(synonyms)
property pre_trained

Id of a data frame that contains a pre-trained (external) word2vec model

Type: H2OFrame.

Examples

>>> words = h2o.create_frame(rows=1000,cols=1,
...                          string_fraction=1.0,
...                          missing_fraction=0.0)
>>> embeddings = h2o.create_frame(rows=1000,cols=100,
...                               real_fraction=1.0,
...                               missing_fraction=0.0)
>>> word_embeddings = words.cbind(embeddings)
>>> w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
>>> w2v_model.train(training_frame=word_embeddings)
>>> model_id = w2v_model.model_id
>>> model = h2o.get_model(model_id)
property sent_sample_rate

Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; useful range is (0, 1e-5)

Type: float (default: 0.001).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=1, sent_sample_rate=0.01)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("teacher", 3)
>>> print(synonyms)
property training_frame

Id of the training data frame.

Type: H2OFrame.

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator()
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("tutor", 3)
>>> print(synonyms)
property vec_size

Set size of word vectors

Type: int (default: 100).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=3, vec_size=50)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("tutor", 3)
>>> print(synonyms)
property window_size

Set max skip length between words

Type: int (default: 5).

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=3, window_size=2)
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("teacher", 3)
>>> print(synonyms)
property word_model

The word model to use (SkipGram or CBOW)

One of: "skip_gram", "cbow" (default: "skip_gram").

Examples

>>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
...                               col_names = ["category", "jobtitle"], 
...                               col_types = ["string", "string"], 
...                               header = 1)
>>> words = job_titles.tokenize(" ")
>>> w2v_model = H2OWord2vecEstimator(epochs=3, word_model="skip_gram")
>>> w2v_model.train(training_frame=words)
>>> synonyms = w2v_model.find_synonyms("assistant", 3)
>>> print(synonyms)

H2OGridSearch

class h2o.grid.H2OGridSearch(*args, **kwargs)[source]

Bases: h2o.grid.grid_search.H2OGridSearch

Grid Search of a Hyper-Parameter Space for a Model

Parameters
  • model – The type of model to be explored initialized with optional parameters that will be unchanged across explored models.

  • hyper_params – A dictionary of string parameters (keys) and a list of values to be explored by grid search (values).

  • grid_id (str) – The unique id assigned to the resulting grid object. If none is given, an id will automatically be generated.

  • search_criteria

    The optional dictionary of directives which control the search of the hyperparameter space. The dictionary can include values for: strategy, max_models, max_runtime_secs, stopping_metric, stopping_tolerance, stopping_rounds and seed. The default strategy, “Cartesian”, covers the entire space of hyperparameter combinations. If you want to use cartesian grid search, you can leave the search_criteria argument unspecified. Specify the “RandomDiscrete” strategy to get random search of all the combinations of your hyperparameters with three ways of specifying when to stop the search: max number of models, max time, and metric-based early stopping (e.g., stop if MSE hasn’t improved by 0.0001 over the 5 best models). Examples below:

    >>> criteria = {"strategy": "RandomDiscrete", "max_runtime_secs": 600,
    ...             "max_models": 100, "stopping_metric": "AUTO",
    ...             "stopping_tolerance": 0.00001, "stopping_rounds": 5,
    ...             "seed": 123456}
    >>> criteria = {"strategy": "RandomDiscrete", "max_models": 42,
    ...             "max_runtime_secs": 28800, "seed": 1234}
    >>> criteria = {"strategy": "RandomDiscrete", "stopping_metric": "AUTO",
    ...             "stopping_tolerance": 0.001, "stopping_rounds": 10}
    >>> criteria = {"strategy": "RandomDiscrete", "stopping_rounds": 5,
    ...             "stopping_metric": "misclassification",
    ...             "stopping_tolerance": 0.00001}
    

  • parallelism – Level of parallelism during grid model building. 1 = sequential building (default). Use the value of 0 for adaptive parallelism - decided by H2O. Any number > 1 sets the exact number of models built in parallel.

Returns

a new H2OGridSearch instance

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
>>> hyper_parameters = {'alpha': [0.01,0.5], 'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)
>>> training_data = h2o.import_file("smalldata/logreg/benign.csv")
>>> gs.train(x=list(range(3)) + list(range(4, 11)), y=3, training_frame=training_data)
>>> gs.show()
aic(train=False, valid=False, xval=False)[source]

Get the AIC(s).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the AIC value for the training data.

  • valid (bool) – If valid is True, then return the AIC value for the validation data.

  • xval (bool) – If xval is True, then return the AIC value for the cross validation data.

Returns

The AIC.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
>>> prostate[2] = prostate[2].asfactor()
>>> prostate[4] = prostate[4].asfactor()
>>> prostate[5] = prostate[5].asfactor()
>>> prostate[8] = prostate[8].asfactor()
>>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
>>> response = "CAPSULE"
>>> hyper_params = {'alpha': [0.01,0.5],
...                 'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=predictors, y=response, training_frame=prostate)
>>> gs.aic()
auc(train=False, valid=False, xval=False)[source]

Get the AUC(s).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the AUC value for the training data.

  • valid (bool) – If valid is True, then return the AUC value for the validation data.

  • xval (bool) – If xval is True, then return the AUC value for the cross validation data.

Returns

The AUC.

Examples

>>> from h2o.estimators import H2OGradientBoostingEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
>>> test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
>>> x = data.columns
>>> y = "response"
>>> x.remove(y)
>>> data[y] = data[y].asfactor()
>>> test[y] = test[y].asfactor()
>>> ss = data.split_frame(seed = 1)
>>> train = ss[0]
>>> valid = ss[1]
>>> gbm_params1 = {'learn_rate': [0.01, 0.1],
...                 'max_depth': [3, 5, 9],
...                 'sample_rate': [0.8, 1.0],
...                 'col_sample_rate': [0.2, 0.5, 1.0]}
>>> gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator,
...                           grid_id='gbm_grid1',
...                           hyper_params=gbm_params1)
>>> gbm_grid1.train(x=x, y=y,
...                 training_frame=train,
...                 validation_frame=valid,
...                 ntrees=100,
...                 seed=1)
>>> gbm_gridperf1 = gbm_grid1.get_grid(sort_by='auc', decreasing=True)
>>> best_gbm1 = gbm_gridperf1.models[0]
>>> best_gbm_perf1 = best_gbm1.model_performance(test)
>>> best_gbm_perf1.auc()
aucpr(train=False, valid=False, xval=False)[source]

Get the aucPR (Area Under PRECISION RECALL Curve).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the aucpr value for the training data.

  • valid (bool) – If valid is True, then return the aucpr value for the validation data.

  • xval (bool) – If xval is True, then return the aucpr value for the cross validation data.

Returns

The AUCPR for the models in this grid.

biases(vector_id=0)[source]

Return the frame for the respective bias vector.

Parameters

vector_id – an integer, ranging from 0 to number of layers, that specifies the bias vector to return.

Returns

an H2OFrame which represents the bias vector identified by vector_id

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> hh = H2ODeepLearningEstimator(hidden=[],
...                               loss="CrossEntropy",
...                               export_weights_and_biases=True)
>>> hh.train(x=list(range(4)), y=4, training_frame=iris)
>>> hh.biases(0)
build_model(algo_params)[source]

(internal)

catoffsets()[source]

Categorical offsets for one-hot encoding

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> hh = H2ODeepLearningEstimator(hidden=[],
...                               loss="CrossEntropy",
...                               export_weights_and_biases=True)
>>> hh.train(x=list(range(4)), y=4, training_frame=iris)
>>> hh.catoffsets()
coef()[source]

Return the coefficients that can be applied to the non-standardized data.

Note: standardize = True by default. If set to False, then coef() returns the coefficients that are fit directly.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> training_data = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/logreg/benign.csv")
>>> hyper_parameters = {'alpha': [0.01,0.5],
...                     'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)
>>> gs.train(x=list(range(3)) + list(range(4, 11)), y=3, training_frame=training_data)
>>> gs.coef()
coef_norm()[source]

Return coefficients fitted on the standardized data (requires standardize = True, which is on by default). These coefficients can be used to evaluate variable importance.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> training_data = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/logreg/benign.csv")
>>> hyper_parameters = {'alpha': [0.01,0.5],
...                     'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)
>>> gs.train(x=list(range(3)) + list(range(4, 11)), y=3, training_frame=training_data)
>>> gs.coef_norm()
deepfeatures(test_data, layer)[source]

Obtain a hidden layer’s details on a dataset.

Parameters
  • test_data – Data to create a feature space on.

  • layer (int) – Index of the hidden layer.

Returns

A dictionary of hidden layer details for each model.

Examples

>>> from h2o.estimators import H2OAutoEncoderEstimator
>>> resp = 784
>>> nfeatures = 20
>>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
>>> train[resp] = train[resp].asfactor()
>>> test = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
>>> test[resp] = test[resp].asfactor()
>>> sid = train[0].runif(0)
>>> train_unsup = train[sid >= 0.5]
>>> train_unsup.pop(resp)
>>> train_sup = train[sid < 0.5]
>>> ae_model = H2OAutoEncoderEstimator(activation="Tanh",
...                                    hidden=[nfeatures],
...                                    model_id="ae_model",
...                                    epochs=1,
...                                    ignore_const_cols=False,
...                                    reproducible=True,
...                                    seed=1234)
>>> ae_model.train(list(range(resp)), training_frame=train_unsup)
>>> ae_model.deepfeatures(train_sup[0:resp], 0)
detach()[source]

Detach the Python object from the backend, usually by clearing its key

property failed_params

Return a list of failed parameters.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
>>> training_data = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/logreg/benign.csv")
>>> hyper_parameters = {'alpha': [0.01,0.5],
...                     'lambda': [1e-5,1e-6],
...                     'beta_epsilon': [0.05]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)
>>> gs.train(x=list(range(3)) + list(range(4, 11)), y=3, training_frame=training_data)
>>> gs.failed_params
get_grid(sort_by=None, decreasing=None)[source]

Retrieve an H2OGridSearch instance.

Optionally specify a metric by which to sort models and a sort order. Note that if neither cross-validation nor a validation frame is used in the grid search, then the training metrics will display in the “get grid” output. If a validation frame is passed to the grid, and nfolds = 0, then the validation metrics will display. However, if nfolds > 1, then cross-validation metrics will display even if a validation frame is provided.

Parameters
  • sort_by (str) – A metric by which to sort the models in the grid space. Choices are: "logloss", "residual_deviance", "mse", "auc", "r2", "accuracy", "precision", "recall", "f1", etc.

  • decreasing (bool) – Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

Returns

A new H2OGridSearch instance optionally sorted on the specified metric.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.get_grid(sort_by='F1', decreasing=True)
get_hyperparams(id, display=True)[source]

Get the hyperparameters of a model explored by grid search.

Parameters
  • id (str) – The model id of the model with hyperparameters of interest.

  • display (bool) – Flag to indicate whether to display the hyperparameter names.

Returns

A list of the hyperparameters for the specified model.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> best_model_id = gs.get_grid(sort_by='F1',
...                             decreasing=True).model_ids[0]
>>> gs.get_hyperparams(best_model_id)
get_hyperparams_dict(id, display=True)[source]

Derive and return the model parameters used to train the particular grid search model.

Parameters
  • id (str) – The model id of the model with hyperparameters of interest.

  • display (bool) – Flag to indicate whether to display the hyperparameter names.

Returns

A dict of model parameters derived from the hyper-parameters used to train this particular model.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> best_model_id = gs.get_grid(sort_by='F1',
...                             decreasing=True).model_ids[0]
>>> gs.get_hyperparams_dict(best_model_id)
get_xval_models(key=None)[source]

Return a Model object.

Parameters

key (str) – If None, return all cross-validated models; otherwise return the model specified by the key.

Returns

A model or a list of models.

Examples

>>> from h2o.estimators import H2OGradientBoostingEstimator
>>> fr = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate_train.csv")
>>> m = H2OGradientBoostingEstimator(nfolds=10,
...                                  ntrees=10,
...                                  keep_cross_validation_models=True)
>>> m.train(x=list(range(2,fr.ncol)), y=1, training_frame=fr)
>>> m.get_xval_models()
gini(train=False, valid=False, xval=False)[source]

Get the Gini Coefficient(s).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the Gini Coefficient value for the training data.

  • valid (bool) – If valid is True, then return the Gini Coefficient value for the validation data.

  • xval (bool) – If xval is True, then return the Gini Coefficient value for the cross validation data.

Returns

The Gini Coefficient for the models in this grid.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.gini()
property grid_id

A key that identifies this grid search object in H2O.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
>>> training_data = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/logreg/benign.csv")
>>> hyper_parameters = {'alpha': [0.01,0.5],
...                     'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)
>>> gs.train(x=list(range(3)) + list(range(4,11)), y=3, training_frame=training_data)
>>> gs.grid_id
property hyper_names

Return the hyperparameter names.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
>>> training_data = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/logreg/benign.csv")
>>> hyper_parameters = {'alpha': [0.01,0.5],
...                     'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)
>>> gs.train(x=list(range(3)) + list(range(4,11)), y=3, training_frame=training_data)
>>> gs.hyper_names
is_cross_validated()[source]

Return True if the model was cross-validated.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.is_cross_validated()
join()[source]

Wait until grid finishes computing.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5), hyper_params)
>>> gs.start(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.join()
property key
Returns

the unique key representing the object on the backend

logloss(train=False, valid=False, xval=False)[source]

Get the Log Loss(s).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the Log Loss value for the training data.

  • valid (bool) – If valid is True, then return the Log Loss value for the validation data.

  • xval (bool) – If xval is True, then return the Log Loss value for the cross validation data.

Returns

The Log Loss for this binomial model.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.logloss()
mean_residual_deviance(train=False, valid=False, xval=False)[source]

Get the Mean Residual Deviance(s).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the Mean Residual Deviance value for the training data.

  • valid (bool) – If valid is True, then return the Mean Residual Deviance value for the validation data.

  • xval (bool) – If xval is True, then return the Mean Residual Deviance value for the cross validation data.

Returns

The Mean Residual Deviance for this regression model.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.mean_residual_deviance()
property model_ids

Returns model ids.

Examples

>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
>>> training_data = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/logreg/benign.csv")
>>> hyper_parameters = {'alpha': [0.01,0.5],
...                     'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_parameters)    
>>> gs.train(x=list(range(3)) + list(range(4,11)), y=3, training_frame=training_data)
>>> gs.model_ids
model_performance(test_data=None, train=False, valid=False, xval=False)[source]

Generate model metrics for this model on test_data.

Parameters
  • test_data – Data set for which model metrics shall be computed against. All three of train, valid and xval arguments are ignored if test_data is not None.

  • train – Report the training metrics for the model.

  • valid – Report the validation metrics for the model.

  • xval – Report the cross-validation metrics for the model.

Returns

An object of class H2OModelMetrics.

Examples

>>> from h2o.estimators import H2OGradientBoostingEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
>>> test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
>>> x = data.columns
>>> y = "response"
>>> x.remove(y)
>>> data[y] = data[y].asfactor()
>>> test[y] = test[y].asfactor()
>>> ss = data.split_frame(seed = 1)
>>> train = ss[0]
>>> valid = ss[1]
>>> gbm_params1 = {'learn_rate': [0.01, 0.1],
...                 'max_depth': [3, 5, 9],
...                 'sample_rate': [0.8, 1.0],
...                 'col_sample_rate': [0.2, 0.5, 1.0]}
>>> gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator,
...                           grid_id='gbm_grid1',
...                           hyper_params=gbm_params1)
>>> gbm_grid1.train(x=x, y=y,
...                 training_frame=train,
...                 validation_frame=valid,
...                 ntrees=100,
...                 seed=1)
>>> gbm_gridperf1 = gbm_grid1.get_grid(sort_by='auc', decreasing=True)
>>> best_gbm1 = gbm_gridperf1.models[0]
>>> best_gbm1.model_performance(test)
mse(train=False, valid=False, xval=False)[source]

Get the MSE(s).

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the MSE value for the training data.

  • valid (bool) – If valid is True, then return the MSE value for the validation data.

  • xval (bool) – If xval is True, then return the MSE value for the cross validation data.

Returns

The MSE for this regression model.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.mse()
normmul()[source]

Normalization/Standardization multipliers for numeric predictors.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.normmul()
normsub()[source]

Normalization/Standardization offsets for numeric predictors.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.normsub()
null_degrees_of_freedom(train=False, valid=False, xval=False)[source]

Retrieve the null degrees of freedom if this model has the attribute, or None otherwise.

Parameters
  • train (bool) – Get the null dof for the training set. If both train and valid are False, then train is selected by default.

  • valid (bool) – Get the null dof for the validation set. If both train and valid are True, then train is selected by default.

  • xval (bool) – Get the null dof for the cross-validated models.

Returns

the null dof, or None if it is not present.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.null_degrees_of_freedom()
null_deviance(train=False, valid=False, xval=False)[source]

Retrieve the null deviance if this model has the attribute, or None otherwise.

Parameters
  • train (bool) – Get the null deviance for the training set. If both train and valid are False, then train is selected by default.

  • valid (bool) – Get the null deviance for the validation set. If both train and valid are True, then train is selected by default.

  • xval (bool) – Get the null deviance for the cross-validated models.

Returns

the null deviance, or None if it is not present.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.null_deviance()
pprint_coef()[source]

Pretty print the coefficients table (includes normalized coefficients).

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.pprint_coef()
pr_auc()[source]

H2OGridSearch.pr_auc is deprecated, please use H2OGridSearch.aucpr instead.

predict(test_data)[source]

Predict on a dataset.

Parameters

test_data (H2OFrame) – Data to be predicted on.

Returns

H2OFrame filled with predictions.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.predict(benign)
r2(train=False, valid=False, xval=False)[source]

Return the R^2 for this regression model.

The R^2 value is defined to be 1 - MSE/var, where var is computed as sigma^2.

If all are False (default), then return the training metric value. If more than one option is set to True, then return a dictionary of metrics where the keys are “train”, “valid”, and “xval”.

Parameters
  • train (bool) – If train is True, then return the R^2 value for the training data.

  • valid (bool) – If valid is True, then return the R^2 value for the validation data.

  • xval (bool) – If xval is True, then return the R^2 value for the cross validation data.

Returns

The R^2 for this regression model.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.r2()
residual_degrees_of_freedom(train=False, valid=False, xval=False)[source]

Retrieve the residual degrees of freedom if this model has the attribute, or None otherwise.

Parameters
  • train (bool) – Get the residual dof for the training set. If both train and valid are False, then train is selected by default.

  • valid (bool) – Get the residual dof for the validation set. If both train and valid are True, then train is selected by default.

  • xval (bool) – Get the residual dof for the cross-validated models.

Returns

the residual degrees of freedom, or None if they are not present.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.residual_degrees_of_freedom()
residual_deviance(train=False, valid=False, xval=False)[source]

Retrieve the residual deviance if this model has the attribute, or None otherwise.

Parameters
  • train (bool) – Get the residual deviance for the training set. If both train and valid are False, then train is selected by default.

  • valid (bool) – Get the residual deviance for the validation set. If both train and valid are True, then train is selected by default.

  • xval (bool) – Get the residual deviance for the cross-validated models.

Returns

the residual deviance, or None if it is not present.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.residual_deviance()
respmul()[source]

Normalization/Standardization multipliers for numeric response.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.respmul()
respsub()[source]

Normalization/Standardization offsets for numeric response.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.respsub()
scoring_history()[source]

Retrieve model scoring history.

Returns

Score history (H2OTwoDimTable)

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.scoring_history()
show()[source]

Print models sorted by metric.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.show()
sort_by(metric, increasing=True)[source]

grid.sort_by() is deprecated; use grid.get_grid() instead

Deprecated since 2016-12-12, use grid.get_grid() instead.

sorted_metric_table()[source]

Retrieve summary table of an H2O Grid Search.

Returns

The summary table as an H2OTwoDimTable or a Pandas DataFrame.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.sorted_metric_table()
start(x, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, **params)[source]

Asynchronous model build by specifying the predictor columns, response column, and any additional frame-specific values.

To block for results, call join().

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • training_frame – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold, offset, and weights).

  • offset_column – The name or index of the column in training_frame that holds the offsets.

  • fold_column – The name or index of the column in training_frame that holds the per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds the per-row weights.

  • validation_frame – H2OFrame with validation data to be scored on while training.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5), hyper_params)
>>> gs.start(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.join()
summary(header=True)[source]

Print a detailed summary of the explored models.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.summary()
train(x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, **params)[source]

Train the model synchronously (i.e. do not return until the model finishes training).

To train asynchronously call start().

Parameters
  • x – A list of column names or indices indicating the predictor columns.

  • y – An index or a column name indicating the response column.

  • training_frame – The H2OFrame having the columns indicated by x and y (as well as any additional columns specified by fold, offset, and weights).

  • offset_column – The name or index of the column in training_frame that holds the offsets.

  • fold_column – The name or index of the column in training_frame that holds the per-row fold assignments.

  • weights_column – The name or index of the column in training_frame that holds the per-row weights.

  • validation_frame – H2OFrame with validation data to be scored on while training.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
varimp(use_pandas=False)[source]

Pretty print the variable importances, or return them in a list/pandas DataFrame.

Parameters

use_pandas (bool) – If True, then the variable importances will be returned as a pandas data frame.

Returns

A dictionary of lists or Pandas DataFrame instances.

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> insurance = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
>>> insurance["offset"] = insurance["Holders"].log()
>>> insurance["Group"] = insurance["Group"].asfactor()
>>> insurance["Age"] = insurance["Age"].asfactor()
>>> insurance["District"] = insurance["District"].asfactor()
>>> hyper_params = {'huber_alpha': [0.2,0.5],
...                 'quantile_alpha': [0.2,0.6]}
>>> from h2o.estimators import H2ODeepLearningEstimator
>>> gs = H2OGridSearch(H2ODeepLearningEstimator(epochs=5),
...                    hyper_params)
>>> gs.train(x=list(range(3)),y="Claims", training_frame=insurance)
>>> gs.varimp(use_pandas=True)
weights(matrix_id=0)[source]

Return the frame for the respective weight matrix.

Param

matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return.

Returns

an H2OFrame which represents the weight matrix identified by matrix_id

Examples

>>> from h2o.estimators import H2ODeepLearningEstimator
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> hh = H2ODeepLearningEstimator(hidden=[],
...                               loss="CrossEntropy",
...                               export_weights_and_biases=True)
>>> hh.train(x=list(range(4)), y=4, training_frame=iris)
>>> hh.weights(0)
xval_keys()[source]

Model keys for the cross-validated model.

Examples

>>> from h2o.estimators import H2OGeneralizedLinearEstimator
>>> from h2o.grid.grid_search import H2OGridSearch
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> y = 3
>>> x = [4,5,6,7,8,9,10,11]
>>> hyper_params = {'alpha': [0.01,0.3,0.5],
...                 'lambda': [1e-5, 1e-6, 1e-7]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
...                    hyper_params)
>>> gs.train(x=x,y=y, training_frame=benign)
>>> gs.xval_keys()
xvals()[source]

Return the list of cross-validated models.