#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals
import ast
import json
import warnings
import h2o
from h2o.base import Keyed
from h2o.exceptions import H2OResponseError, H2ODeprecationWarning
from h2o.grid import H2OGridSearch
from h2o.job import H2OJob
from h2o.utils.shared_utils import quoted
from h2o.utils.typechecks import is_type
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
class H2OStackedEnsembleEstimator(H2OEstimator):
"""
Stacked Ensemble
Builds a stacked ensemble (aka "super learner") machine learning method that uses two
or more H2O learning algorithms to improve predictive performance. It is a loss-based
supervised learning method that finds the optimal combination of a collection of prediction
algorithms. This method supports regression and binary classification.
:examples:
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators.random_forest import H2ORandomForestEstimator
>>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
>>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
>>> col_types = ["numeric", "numeric", "numeric", "enum",
... "enum", "numeric", "numeric", "numeric", "numeric"]
>>> data = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv", col_types=col_types)
>>> train, test = data.split_frame(ratios=[.8], seed=1)
>>> x = ["CAPSULE","GLEASON","RACE","DPROS","DCAPS","PSA","VOL"]
>>> y = "AGE"
>>> nfolds = 5
>>> gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
... fold_assignment="Modulo",
... keep_cross_validation_predictions=True)
>>> gbm.train(x=x, y=y, training_frame=train)
>>> rf = H2ORandomForestEstimator(nfolds=nfolds,
... fold_assignment="Modulo",
... keep_cross_validation_predictions=True)
>>> rf.train(x=x, y=y, training_frame=train)
>>> stack = H2OStackedEnsembleEstimator(model_id="ensemble",
... training_frame=train,
... validation_frame=test,
... base_models=[gbm.model_id, rf.model_id])
>>> stack.train(x=x, y=y, training_frame=train, validation_frame=test)
>>> stack.model_performance()
"""
# Algorithm name of this estimator (presumably read by the H2OEstimator base class to pick the
# backend model builder -- confirm against estimator_base).
algo = "stackedensemble"
# Flags the estimator as a supervised learner (NOTE(review): exact use is defined by H2OEstimator).
supervised_learning = True
# Extra class-level options; here it lists the Fairness model extension by dotted path. How the
# option is consumed is not visible in this file.
_options_ = {'model_extensions': ['h2o.model.extensions.Fairness']}
def __init__(self,
             model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
             training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
             response_column=None,  # type: Optional[str]
             validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
             blending_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
             base_models=[],  # type: List[str]
             metalearner_algorithm="auto",  # type: Literal["auto", "deeplearning", "drf", "gbm", "glm", "naivebayes", "xgboost"]
             metalearner_nfolds=0,  # type: int
             metalearner_fold_assignment=None,  # type: Optional[Literal["auto", "random", "modulo", "stratified"]]
             metalearner_fold_column=None,  # type: Optional[str]
             metalearner_params=None,  # type: Optional[dict]
             metalearner_transform="none",  # type: Literal["none", "logit"]
             max_runtime_secs=0.0,  # type: float
             weights_column=None,  # type: Optional[str]
             offset_column=None,  # type: Optional[str]
             seed=-1,  # type: int
             score_training_samples=10000,  # type: int
             keep_levelone_frame=False,  # type: bool
             export_checkpoints_dir=None,  # type: Optional[str]
             auc_type="auto",  # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
             ):
    """
    Create a Stacked Ensemble estimator and record its parameters.

    Every argument except ``model_id`` is assigned through the property of the same name, so it is
    validated by that property's setter before being stored in ``self._parms``.

    :param model_id: Destination id for this model; auto-generated if not specified.
           Defaults to ``None``.
    :type model_id: Union[None, str, H2OEstimator], optional
    :param training_frame: Id of the training data frame.
           Defaults to ``None``.
    :type training_frame: Union[None, str, H2OFrame], optional
    :param response_column: Response variable column.
           Defaults to ``None``.
    :type response_column: str, optional
    :param validation_frame: Id of the validation data frame.
           Defaults to ``None``.
    :type validation_frame: Union[None, str, H2OFrame], optional
    :param blending_frame: Frame used to compute the predictions that serve as the training frame for the
           metalearner (triggers blending mode if provided)
           Defaults to ``None``.
    :type blending_frame: Union[None, str, H2OFrame], optional
    :param base_models: List of models or grids (or their ids) to ensemble/stack together. Grids are expanded to
           individual models. If not using blending frame, then models must have been cross-validated using nfolds >
           1, and folds must be identical across models.
           Defaults to ``[]``.
    :type base_models: List[str]
    :param metalearner_algorithm: Type of algorithm to use as the metalearner. Options include 'AUTO' (GLM with non
           negative weights; if validation_frame is present, a lambda search is performed), 'deeplearning' (Deep
           Learning with default parameters), 'drf' (Random Forest with default parameters), 'gbm' (GBM with default
           parameters), 'glm' (GLM with default parameters), 'naivebayes' (NaiveBayes with default parameters), or
           'xgboost' (if available, XGBoost with default parameters).
           Defaults to ``"auto"``.
    :type metalearner_algorithm: Literal["auto", "deeplearning", "drf", "gbm", "glm", "naivebayes", "xgboost"]
    :param metalearner_nfolds: Number of folds for K-fold cross-validation of the metalearner algorithm (0 to
           disable or >= 2).
           Defaults to ``0``.
    :type metalearner_nfolds: int
    :param metalearner_fold_assignment: Cross-validation fold assignment scheme for metalearner cross-validation.
           Defaults to AUTO (which is currently set to Random). The 'Stratified' option will stratify the folds
           based on the response variable, for classification problems.
           Defaults to ``None``.
    :type metalearner_fold_assignment: Literal["auto", "random", "modulo", "stratified"], optional
    :param metalearner_fold_column: Column with cross-validation fold index assignment per observation for cross-
           validation of the metalearner.
           Defaults to ``None``.
    :type metalearner_fold_column: str, optional
    :param metalearner_params: Parameters for metalearner algorithm
           Defaults to ``None``.
    :type metalearner_params: dict, optional
    :param metalearner_transform: Transformation used for the level one frame.
           Defaults to ``"none"``.
    :type metalearner_transform: Literal["none", "logit"]
    :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
           Defaults to ``0.0``.
    :type max_runtime_secs: float
    :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
           to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
           that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
           not increase the size of the data frame. This is typically the number of times a row is repeated, but
           non-integer values are supported as well. During training, rows with higher weights matter more, due to
           the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
           that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0.
           Defaults to ``None``.
    :type weights_column: str, optional
    :param offset_column: Offset column. This will be added to the combination of columns before applying the link
           function.
           Defaults to ``None``.
    :type offset_column: str, optional
    :param seed: Seed for random numbers; passed through to the metalearner algorithm. Defaults to -1 (time-based
           random number)
           Defaults to ``-1``.
    :type seed: int
    :param score_training_samples: Specify the number of training set samples for scoring. The value must be >= 0.
           To use all training samples, enter 0.
           Defaults to ``10000``.
    :type score_training_samples: int
    :param keep_levelone_frame: Keep level one frame used for metalearner training.
           Defaults to ``False``.
    :type keep_levelone_frame: bool
    :param export_checkpoints_dir: Automatically export generated models to this directory.
           Defaults to ``None``.
    :type export_checkpoints_dir: str, optional
    :param auc_type: Set default multinomial AUC type.
           Defaults to ``"auto"``.
    :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
    """
    super(H2OStackedEnsembleEstimator, self).__init__()
    self._parms = {}
    # The model id is kept both as the estimator's own id and as a regular parameter.
    self._id = model_id
    self._parms['model_id'] = model_id
    # Assign through the properties, in signature order, so each setter validates its value.
    # The shared ``[]`` default of ``base_models`` is harmless here: the base_models setter only
    # ever stores a freshly built list, never the list object it was given.
    ordered_settings = (
        ("training_frame", training_frame),
        ("response_column", response_column),
        ("validation_frame", validation_frame),
        ("blending_frame", blending_frame),
        ("base_models", base_models),
        ("metalearner_algorithm", metalearner_algorithm),
        ("metalearner_nfolds", metalearner_nfolds),
        ("metalearner_fold_assignment", metalearner_fold_assignment),
        ("metalearner_fold_column", metalearner_fold_column),
        ("metalearner_params", metalearner_params),
        ("metalearner_transform", metalearner_transform),
        ("max_runtime_secs", max_runtime_secs),
        ("weights_column", weights_column),
        ("offset_column", offset_column),
        ("seed", seed),
        ("score_training_samples", score_training_samples),
        ("keep_levelone_frame", keep_levelone_frame),
        ("export_checkpoints_dir", export_checkpoints_dir),
        ("auc_type", auc_type),
    )
    for attribute, value in ordered_settings:
        setattr(self, attribute, value)
    # NOTE(review): presumably the REST API version the base class uses for this algo -- confirm.
    self._parms["_rest_version"] = 99
@property
def training_frame(self):
    """
    Id of the training data frame.

    Type: ``Union[None, str, H2OFrame]``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, valid = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> valid[y] = valid[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           metalearner_fold_assignment="Random")
    >>> stack_blend.train(x=x, y=y, training_frame=train, validation_frame=valid)
    >>> stack_blend.model_performance(valid).auc()
    """
    return self._parms.get("training_frame")

@training_frame.setter
def training_frame(self, training_frame):
    # H2OFrame._validate checks the argument and returns the value to store
    # (its exact rules live in h2o.frame and are not visible here).
    self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def response_column(self):
    """
    Column holding the response variable.

    Type: ``str``.
    """
    return self._parms.get("response_column", None)

@response_column.setter
def response_column(self, response_column):
    # Accepts ``None`` or a string; anything else is rejected by assert_is_type.
    column = response_column
    assert_is_type(column, None, str)
    self._parms["response_column"] = column
@property
def validation_frame(self):
    """
    Id of the validation data frame.

    Type: ``Union[None, str, H2OFrame]``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, valid = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> valid[y] = valid[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           metalearner_fold_assignment="Random")
    >>> stack_blend.train(x=x, y=y, training_frame=train, validation_frame=valid)
    >>> stack_blend.model_performance(valid).auc()
    """
    return self._parms.get("validation_frame")

@validation_frame.setter
def validation_frame(self, validation_frame):
    # H2OFrame._validate checks the argument and returns the value to store
    # (its exact rules live in h2o.frame and are not visible here).
    self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def blending_frame(self):
    """
    Frame whose predictions become the training frame of the metalearner; supplying it switches
    the ensemble to blending mode.

    Type: ``Union[None, str, H2OFrame]``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=10,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1)
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    return self._parms.get("blending_frame", None)

@blending_frame.setter
def blending_frame(self, blending_frame):
    # Store whatever H2OFrame._validate hands back for this argument.
    checked = H2OFrame._validate(blending_frame, 'blending_frame')
    self._parms["blending_frame"] = checked
@property
def base_models(self):
    """
    List of models or grids (or their ids) to ensemble/stack together. Grids are expanded to individual models. If
    not using blending frame, then models must have been cross-validated using nfolds > 1, and folds must be
    identical across models.

    The getter prefers the ``name`` of each entry reported in ``actual_params["base_models"]`` and falls
    back to the value stored by the setter when that list is empty.

    Type: ``List[str]``, defaults to ``[]``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> col_types = ["numeric", "numeric", "numeric", "enum",
    ...              "enum", "numeric", "numeric", "numeric", "numeric"]
    >>> data = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv", col_types=col_types)
    >>> train, test = data.split_frame(ratios=[.8], seed=1)
    >>> x = ["CAPSULE","GLEASON","RACE","DPROS","DCAPS","PSA","VOL"]
    >>> y = "AGE"
    >>> nfolds = 5
    >>> gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
    ...                                    fold_assignment="Modulo",
    ...                                    keep_cross_validation_predictions=True)
    >>> gbm.train(x=x, y=y, training_frame=train)
    >>> rf = H2ORandomForestEstimator(nfolds=nfolds,
    ...                               fold_assignment="Modulo",
    ...                               keep_cross_validation_predictions=True)
    >>> rf.train(x=x, y=y, training_frame=train)
    >>> stack = H2OStackedEnsembleEstimator(model_id="ensemble",
    ...                                     training_frame=train,
    ...                                     validation_frame=test,
    ...                                     base_models=[gbm.model_id, rf.model_id])
    >>> stack.train(x=x, y=y, training_frame=train, validation_frame=test)
    >>> stack.model_performance()
    """
    reported = self.actual_params.get("base_models", [])
    names = [entry["name"] for entry in reported]
    if names:
        return names
    return self._parms.get("base_models")

@base_models.setter
def base_models(self, base_models):
    # A single model/grid/id is accepted as shorthand for a one-element list.
    if not is_type(base_models, list):
        base_models = [base_models]
    element_types = [H2OEstimator, H2OGridSearch, str]
    if is_type(base_models, element_types):
        # Keyed objects (models, grids) are stored by key; plain ids are stored as given.
        self._parms["base_models"] = [b.key if isinstance(b, Keyed) else b for b in base_models]
    else:
        # Assert against the types that are really accepted so the error describes them;
        # the previous check against ``None`` produced a message asking for ``None``.
        assert_is_type(base_models, element_types)
@property
def metalearner_algorithm(self):
    """
    Type of algorithm to use as the metalearner. Options include 'AUTO' (GLM with non negative weights; if
    validation_frame is present, a lambda search is performed), 'deeplearning' (Deep Learning with default
    parameters), 'drf' (Random Forest with default parameters), 'gbm' (GBM with default parameters), 'glm' (GLM with
    default parameters), 'naivebayes' (NaiveBayes with default parameters), or 'xgboost' (if available, XGBoost with
    default parameters).

    Type: ``Literal["auto", "deeplearning", "drf", "gbm", "glm", "naivebayes", "xgboost"]``, defaults to ``"auto"``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           metalearner_algorithm="gbm")
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    return self._parms.get("metalearner_algorithm", None)

@metalearner_algorithm.setter
def metalearner_algorithm(self, metalearner_algorithm):
    # Either ``None`` or one of the algorithm names below.
    choices = Enum("auto", "deeplearning", "drf", "gbm", "glm", "naivebayes", "xgboost")
    assert_is_type(metalearner_algorithm, None, choices)
    self._parms["metalearner_algorithm"] = metalearner_algorithm
@property
def metalearner_nfolds(self):
    """
    Number of folds for K-fold cross-validation of the metalearner algorithm (0 to disable or >= 2).

    Type: ``int``, defaults to ``0``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           metalearner_nfolds=3)
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    return self._parms.get("metalearner_nfolds", None)

@metalearner_nfolds.setter
def metalearner_nfolds(self, metalearner_nfolds):
    # Only the type (``None`` or int) is checked here; the fold-count range is not.
    folds = metalearner_nfolds
    assert_is_type(folds, None, int)
    self._parms["metalearner_nfolds"] = folds
@property
def metalearner_fold_assignment(self):
    """
    Cross-validation fold assignment scheme for metalearner cross-validation. Defaults to AUTO (which is currently
    set to Random). The 'Stratified' option will stratify the folds based on the response variable, for
    classification problems.

    Type: ``Literal["auto", "random", "modulo", "stratified"]``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           metalearner_fold_assignment="Random")
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    return self._parms.get("metalearner_fold_assignment", None)

@metalearner_fold_assignment.setter
def metalearner_fold_assignment(self, metalearner_fold_assignment):
    # Either ``None`` or one of the fold assignment schemes below.
    schemes = Enum("auto", "random", "modulo", "stratified")
    assert_is_type(metalearner_fold_assignment, None, schemes)
    self._parms["metalearner_fold_assignment"] = metalearner_fold_assignment
@property
def metalearner_fold_column(self):
    """
    Column with cross-validation fold index assignment per observation for cross-validation of the metalearner.

    Type: ``str``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_test_5k.csv")
    >>> fold_column = "fold_id"
    >>> train[fold_column] = train.kfold_column(n_folds=3, seed=1)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> x.remove(fold_column)
    >>> train[y] = train[y].asfactor()
    >>> test[y] = test[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=10,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                     metalearner_fold_column=fold_column,
    ...                                     metalearner_params=dict(keep_cross_validation_models=True))
    >>> stack.train(x=x, y=y, training_frame=train)
    >>> stack.model_performance().auc()
    """
    return self._parms.get("metalearner_fold_column", None)

@metalearner_fold_column.setter
def metalearner_fold_column(self, metalearner_fold_column):
    # Accepts ``None`` or a column name given as a string.
    column = metalearner_fold_column
    assert_is_type(column, None, str)
    self._parms["metalearner_fold_column"] = column
@property
def metalearner_params(self):
    """
    Parameters for metalearner algorithm

    The setter stores the parameters as a JSON string in which every value is a list; the getter
    decodes that string and unwraps one-element lists so single values read back as scalars.

    Type: ``dict``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> gbm_params = {"ntrees" : 100, "max_depth" : 6}
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           metalearner_algorithm="gbm",
    ...                                           metalearner_params=gbm_params)
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    encoded = self._parms.get("metalearner_params")
    if encoded is None:
        return None
    try:
        # The setter writes JSON, so decode it as JSON: ast.literal_eval cannot read
        # ``true``/``false``/``null`` and failed on boolean or null parameter values.
        decoded = json.loads(encoded)
    except ValueError:
        # Keep accepting a Python-literal string, which the previous implementation could read.
        decoded = ast.literal_eval(encoded)
    # Undo the setter's wrapping: only genuine one-element lists are unwrapped.
    return {name: (value[0] if isinstance(value, list) and len(value) == 1 else value)
            for name, value in decoded.items()}

@metalearner_params.setter
def metalearner_params(self, metalearner_params):
    assert_is_type(metalearner_params, None, dict)
    if metalearner_params is None:
        self._parms["metalearner_params"] = None
        return
    # Wrap every non-list value in a list. The result is built as a new dict so the caller's
    # dict is left untouched, and the test is on the value's type rather than on whether its
    # string form happens to contain "]".
    wrapped = {name: (value if isinstance(value, list) else [value])
               for name, value in metalearner_params.items()}
    self._parms["metalearner_params"] = str(json.dumps(wrapped))
@property
def metalearner_transform(self):
    """
    Transformation used for the level one frame.

    Type: ``Literal["none", "logit"]``, defaults to ``"none"``.
    """
    return self._parms.get("metalearner_transform", None)

@metalearner_transform.setter
def metalearner_transform(self, metalearner_transform):
    # Either ``None`` or one of the transformation names below.
    transforms = Enum("none", "logit")
    assert_is_type(metalearner_transform, None, transforms)
    self._parms["metalearner_transform"] = metalearner_transform
@property
def max_runtime_secs(self):
    """
    Maximum allowed runtime in seconds for model training. Use 0 to disable.

    Type: ``float``, defaults to ``0.0``.
    """
    return self._parms.get("max_runtime_secs", None)

@max_runtime_secs.setter
def max_runtime_secs(self, max_runtime_secs):
    # Accepts ``None`` or any value matching the ``numeric`` type check.
    limit = max_runtime_secs
    assert_is_type(limit, None, numeric)
    self._parms["max_runtime_secs"] = limit
@property
def weights_column(self):
    """
    Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
    dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
    weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
    frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
    During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
    weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an
    accurate prediction, remove all rows with weight == 0.

    Type: ``str``.
    """
    return self._parms.get("weights_column", None)

@weights_column.setter
def weights_column(self, weights_column):
    # Accepts ``None`` or a column name given as a string.
    column = weights_column
    assert_is_type(column, None, str)
    self._parms["weights_column"] = column
@property
def offset_column(self):
    """
    Offset column. This will be added to the combination of columns before applying the link function.

    Type: ``str``.
    """
    return self._parms.get("offset_column", None)

@offset_column.setter
def offset_column(self, offset_column):
    # Accepts ``None`` or a column name given as a string.
    column = offset_column
    assert_is_type(column, None, str)
    self._parms["offset_column"] = column
@property
def seed(self):
    """
    Seed for random numbers; passed through to the metalearner algorithm. Defaults to -1 (time-based random number)

    Type: ``int``, defaults to ``-1``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           metalearner_fold_assignment="Random")
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    return self._parms.get("seed", None)

@seed.setter
def seed(self, seed):
    # Accepts ``None`` or an int.
    value = seed
    assert_is_type(value, None, int)
    self._parms["seed"] = value
@property
def score_training_samples(self):
    """
    Specify the number of training set samples for scoring. The value must be >= 0. To use all training samples,
    enter 0.

    Type: ``int``, defaults to ``10000``.
    """
    return self._parms.get("score_training_samples", None)

@score_training_samples.setter
def score_training_samples(self, score_training_samples):
    # Only the type (``None`` or int) is checked here; the >= 0 requirement is not.
    samples = score_training_samples
    assert_is_type(samples, None, int)
    self._parms["score_training_samples"] = samples
@property
def keep_levelone_frame(self):
    """
    Keep level one frame used for metalearner training.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=1,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           keep_levelone_frame=True)
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.model_performance(blend).auc()
    """
    return self._parms.get("keep_levelone_frame", None)

@keep_levelone_frame.setter
def keep_levelone_frame(self, keep_levelone_frame):
    # Accepts ``None`` or a bool.
    flag = keep_levelone_frame
    assert_is_type(flag, None, bool)
    self._parms["keep_levelone_frame"] = flag
@property
def export_checkpoints_dir(self):
    """
    Automatically export generated models to this directory.

    Type: ``str``.

    The value is read from this estimator's parameter map; ``None`` is
    returned when no entry is stored under ``"export_checkpoints_dir"``.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> import tempfile
    >>> from os import listdir
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> checkpoints_dir = tempfile.mkdtemp()
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=10,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           export_checkpoints_dir=checkpoints_dir)
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> len(listdir(checkpoints_dir))
    """
    stored_value = self._parms.get("export_checkpoints_dir")
    return stored_value
@export_checkpoints_dir.setter
def export_checkpoints_dir(self, export_checkpoints_dir):
    # Run the type check (``None`` or ``str``) before touching the parameter
    # map, so the value is stored only once that call has returned.
    assert_is_type(export_checkpoints_dir, None, str)
    self._parms["export_checkpoints_dir"] = export_checkpoints_dir
@property
def auc_type(self):
    """
    Set default multinomial AUC type.

    Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
    ``"auto"``.

    The value is read from this estimator's parameter map; ``None`` is
    returned when no entry is stored under ``"auc_type"``.
    """
    stored_value = self._parms.get("auc_type")
    return stored_value
@auc_type.setter
def auc_type(self, auc_type):
    # The accepted names mirror the ``Literal[...]`` listed in the getter's
    # docstring; ``None`` is allowed as well.
    allowed_types = Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo")
    # Type-check first; the value is stored only once that call has returned.
    assert_is_type(auc_type, None, allowed_types)
    self._parms["auc_type"] = auc_type
def levelone_frame_id(self):
    """Fetch the levelone_frame_id for an H2OStackedEnsembleEstimator.

    :returns: the value stored under ``"levelone_frame_id"`` in the model's
        ``"output"`` section. If that entry is absent or ``None``, a notice is
        printed and ``None`` is returned.

    :examples:

    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
    >>> higgs = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/higgs_train_5k.csv")
    >>> train, blend = higgs.split_frame(ratios = [.8], seed = 1234)
    >>> x = train.columns
    >>> y = "response"
    >>> x.remove(y)
    >>> train[y] = train[y].asfactor()
    >>> blend[y] = blend[y].asfactor()
    >>> nfolds = 3
    >>> my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
    ...                                       ntrees=10,
    ...                                       nfolds=nfolds,
    ...                                       fold_assignment="Modulo",
    ...                                       keep_cross_validation_predictions=True,
    ...                                       seed=1)
    >>> my_gbm.train(x=x, y=y, training_frame=train)
    >>> my_rf = H2ORandomForestEstimator(ntrees=50,
    ...                                  nfolds=nfolds,
    ...                                  fold_assignment="Modulo",
    ...                                  keep_cross_validation_predictions=True,
    ...                                  seed=1)
    >>> my_rf.train(x=x, y=y, training_frame=train)
    >>> stack_blend = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
    ...                                           seed=1,
    ...                                           keep_levelone_frame=True)
    >>> stack_blend.train(x=x, y=y, training_frame=train, blending_frame=blend)
    >>> stack_blend.levelone_frame_id()
    """
    output = self._model_json["output"]
    # Guard clause: a missing key and an explicit ``None`` are treated alike.
    if "levelone_frame_id" not in output or output["levelone_frame_id"] is None:
        print("No levelone_frame_id for this model")
        return None
    return output["levelone_frame_id"]
def stacking_strategy(self):
    """Fetch the stacking strategy recorded in the model output.

    :returns: the value stored under ``"stacking_strategy"`` in the model's
        ``"output"`` section. If that entry is absent or ``None``, a notice is
        printed and ``None`` is returned.
    """
    output = self._model_json["output"]
    # Guard clause: a missing key and an explicit ``None`` are treated alike.
    if "stacking_strategy" not in output or output["stacking_strategy"] is None:
        print("No stacking strategy for this model")
        return None
    return output["stacking_strategy"]
# Override train method to support blending
def train(self, x=None, y=None, training_frame=None, blending_frame=None, verbose=False, **kwargs):
    """
    Train the stacked ensemble, optionally with a blending frame.

    :param x: forwarded unchanged to the parent class's ``_make_parms``.
    :param y: forwarded unchanged to the parent class's ``_make_parms``.
    :param training_frame: training data. When neither this argument nor
        ``self.training_frame`` is set, ``blending_frame`` becomes mandatory
        and is also used as the training frame.
    :param blending_frame: frame validated with ``H2OFrame._validate`` and,
        when not ``None``, added to the request under ``"blending_frame"``.
    :param verbose: forwarded to the parent class's ``_train``.
    :param kwargs: forwarded unchanged to the parent class's ``_make_parms``.

    :returns: this estimator (``self``).
    :raises H2OResponseError: if ``self.metalearner()`` is ``None`` once
        training has returned.
    """
    has_training_frame = training_frame is not None or self.training_frame is not None
    blending_frame = H2OFrame._validate(blending_frame, 'blending_frame', required=not has_training_frame)

    if not has_training_frame:
        training_frame = blending_frame  # used to bypass default checks in super class and backend and to guarantee default metrics

    # Name the defining class explicitly. ``super(self.__class__, self)`` binds
    # to the *runtime* type, so in a subclass the lookup would start one level
    # too low in the MRO instead of at this class's parent.
    sup = super(H2OStackedEnsembleEstimator, self)

    def extend_parms(parms):
        # Hook handed to ``_make_parms`` to adjust the request parameters.
        if blending_frame is not None:
            parms['blending_frame'] = blending_frame
        if self.metalearner_fold_column is not None:
            # NOTE(review): assumes parms['ignored_columns'] is a list that
            # contains the quoted fold column at this point -- confirm against
            # ``_make_parms``; ``remove`` raises otherwise.
            parms['ignored_columns'].remove(quoted(self.metalearner_fold_column))

    parms = sup._make_parms(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)

    sup._train(parms, verbose=verbose)
    if self.metalearner() is None:
        raise H2OResponseError("Meta learner didn't get to be trained in time. "
                               "Try increasing max_runtime_secs or setting it to 0 (unlimited).")
    return self