#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.utils.metaclass import deprecated_params, deprecated_property
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
class H2ORandomForestEstimator(H2OEstimator):
    """
    Distributed Random Forest

    Builds a Distributed Random Forest (DRF) on a parsed dataset, for regression or
    classification.
    """

    # Algorithm identifier for this estimator.
    algo = "drf"
    supervised_learning = True
    # Model-extension class paths and the verbosity flag; how they are consumed is
    # not visible in this file (presumably by the H2OEstimator base -- TODO confirm).
    _options_ = {'model_extensions': ['h2o.model.extensions.ScoringHistoryTrees',
                                      'h2o.model.extensions.VariableImportance',
                                      'h2o.model.extensions.Trees',
                                      'h2o.model.extensions.SupervisedTrees'],
                 'verbose': True}
@deprecated_params({'offset_column': None})
def __init__(self,
             model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
             training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
             validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
             nfolds=0,  # type: int
             keep_cross_validation_models=True,  # type: bool
             keep_cross_validation_predictions=False,  # type: bool
             keep_cross_validation_fold_assignment=False,  # type: bool
             score_each_iteration=False,  # type: bool
             score_tree_interval=0,  # type: int
             fold_assignment="auto",  # type: Literal["auto", "random", "modulo", "stratified"]
             fold_column=None,  # type: Optional[str]
             response_column=None,  # type: Optional[str]
             ignored_columns=None,  # type: Optional[List[str]]
             ignore_const_cols=True,  # type: bool
             weights_column=None,  # type: Optional[str]
             balance_classes=False,  # type: bool
             class_sampling_factors=None,  # type: Optional[List[float]]
             max_after_balance_size=5.0,  # type: float
             max_confusion_matrix_size=20,  # type: int
             ntrees=50,  # type: int
             max_depth=20,  # type: int
             min_rows=1.0,  # type: float
             nbins=20,  # type: int
             nbins_top_level=1024,  # type: int
             nbins_cats=1024,  # type: int
             r2_stopping=None,  # type: Optional[float]
             stopping_rounds=0,  # type: int
             stopping_metric="auto",  # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
             stopping_tolerance=0.001,  # type: float
             max_runtime_secs=0.0,  # type: float
             seed=-1,  # type: int
             build_tree_one_node=False,  # type: bool
             mtries=-1,  # type: int
             sample_rate=0.632,  # type: float
             sample_rate_per_class=None,  # type: Optional[List[float]]
             binomial_double_trees=False,  # type: bool
             checkpoint=None,  # type: Optional[Union[None, str, H2OEstimator]]
             col_sample_rate_change_per_level=1.0,  # type: float
             col_sample_rate_per_tree=1.0,  # type: float
             min_split_improvement=1e-05,  # type: float
             histogram_type="auto",  # type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"]
             categorical_encoding="auto",  # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
             calibrate_model=False,  # type: bool
             calibration_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
             distribution="auto",  # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
             custom_metric_func=None,  # type: Optional[str]
             export_checkpoints_dir=None,  # type: Optional[str]
             check_constant_response=True,  # type: bool
             gainslift_bins=-1,  # type: int
             auc_type="auto",  # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
             ):
    """
    Create the estimator and record every supplied hyper-parameter.

    :param model_id: Destination id for this model; auto-generated if not specified.
           Defaults to ``None``.
    :type model_id: Union[None, str, H2OEstimator], optional
    :param training_frame: Id of the training data frame.
           Defaults to ``None``.
    :type training_frame: Union[None, str, H2OFrame], optional
    :param validation_frame: Id of the validation data frame.
           Defaults to ``None``.
    :type validation_frame: Union[None, str, H2OFrame], optional
    :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2).
           Defaults to ``0``.
    :type nfolds: int
    :param keep_cross_validation_models: Whether to keep the cross-validation models.
           Defaults to ``True``.
    :type keep_cross_validation_models: bool
    :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
           Defaults to ``False``.
    :type keep_cross_validation_predictions: bool
    :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment.
           Defaults to ``False``.
    :type keep_cross_validation_fold_assignment: bool
    :param score_each_iteration: Whether to score during each iteration of model training.
           Defaults to ``False``.
    :type score_each_iteration: bool
    :param score_tree_interval: Score the model after every so many trees. Disabled if set to 0.
           Defaults to ``0``.
    :type score_tree_interval: int
    :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The
           'Stratified' option will stratify the folds based on the response variable, for classification problems.
           Defaults to ``"auto"``.
    :type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
    :param fold_column: Column with cross-validation fold index assignment per observation.
           Defaults to ``None``.
    :type fold_column: str, optional
    :param response_column: Response variable column.
           Defaults to ``None``.
    :type response_column: str, optional
    :param ignored_columns: Names of columns to ignore for training.
           Defaults to ``None``.
    :type ignored_columns: List[str], optional
    :param ignore_const_cols: Ignore constant columns.
           Defaults to ``True``.
    :type ignore_const_cols: bool
    :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
           to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
           that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
           not increase the size of the data frame. This is typically the number of times a row is repeated, but
           non-integer values are supported as well. During training, rows with higher weights matter more, due to
           the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
           that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0.
           Defaults to ``None``.
    :type weights_column: str, optional
    :param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
           Defaults to ``False``.
    :type balance_classes: bool
    :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not
           specified, sampling factors will be automatically computed to obtain class balance during training.
           Requires balance_classes.
           Defaults to ``None``.
    :type class_sampling_factors: List[float], optional
    :param max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be
           less than 1.0). Requires balance_classes.
           Defaults to ``5.0``.
    :type max_after_balance_size: float
    :param max_confusion_matrix_size: [Deprecated] Maximum size (# classes) for confusion matrices to be printed in
           the Logs.
           Defaults to ``20``.
    :type max_confusion_matrix_size: int
    :param ntrees: Number of trees.
           Defaults to ``50``.
    :type ntrees: int
    :param max_depth: Maximum tree depth (0 for unlimited).
           Defaults to ``20``.
    :type max_depth: int
    :param min_rows: Fewest allowed (weighted) observations in a leaf.
           Defaults to ``1.0``.
    :type min_rows: float
    :param nbins: For numerical columns (real/int), build a histogram of (at least) this many bins, then split at
           the best point.
           Defaults to ``20``.
    :type nbins: int
    :param nbins_top_level: For numerical columns (real/int), build a histogram of (at most) this many bins at the
           root level, then decrease by factor of two per level.
           Defaults to ``1024``.
    :type nbins_top_level: int
    :param nbins_cats: For categorical columns (factors), build a histogram of this many bins, then split at the
           best point. Higher values can lead to more overfitting.
           Defaults to ``1024``.
    :type nbins_cats: int
    :param r2_stopping: r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds,
           stopping_metric and stopping_tolerance instead. Previous version of H2O would stop making trees when the
           R^2 metric equals or exceeds this.
           Defaults to ``∞`` (the Python signature passes ``None`` when unset).
    :type r2_stopping: float, optional
    :param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average of
           length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable).
           Defaults to ``0``.
    :type stopping_rounds: int
    :param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for
           regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be
           used in GBM and DRF with the Python client.
           Defaults to ``"auto"``.
    :type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
           "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
    :param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative improvement
           is not at least this much).
           Defaults to ``0.001``.
    :type stopping_tolerance: float
    :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
           Defaults to ``0.0``.
    :type max_runtime_secs: float
    :param seed: Seed for pseudo random number generator (if applicable).
           Defaults to ``-1``.
    :type seed: int
    :param build_tree_one_node: Run on one node only; no network overhead but fewer cpus used. Suitable for small
           datasets.
           Defaults to ``False``.
    :type build_tree_one_node: bool
    :param mtries: Number of variables randomly sampled as candidates at each split. If set to -1, defaults to
           sqrt{p} for classification and p/3 for regression (where p is the # of predictors).
           Defaults to ``-1``.
    :type mtries: int
    :param sample_rate: Row sample rate per tree (from 0.0 to 1.0).
           Defaults to ``0.632``.
    :type sample_rate: float
    :param sample_rate_per_class: A list of row sample rates per class (relative fraction for each class, from 0.0
           to 1.0), for each tree.
           Defaults to ``None``.
    :type sample_rate_per_class: List[float], optional
    :param binomial_double_trees: For binary classification: Build 2x as many trees (one per class) - can lead to
           higher accuracy.
           Defaults to ``False``.
    :type binomial_double_trees: bool
    :param checkpoint: Model checkpoint to resume training with.
           Defaults to ``None``.
    :type checkpoint: Union[None, str, H2OEstimator], optional
    :param col_sample_rate_change_per_level: Relative change of the column sampling rate for every level (must be >
           0.0 and <= 2.0).
           Defaults to ``1.0``.
    :type col_sample_rate_change_per_level: float
    :param col_sample_rate_per_tree: Column sample rate per tree (from 0.0 to 1.0).
           Defaults to ``1.0``.
    :type col_sample_rate_per_tree: float
    :param min_split_improvement: Minimum relative improvement in squared error reduction for a split to happen.
           Defaults to ``1e-05``.
    :type min_split_improvement: float
    :param histogram_type: What type of histogram to use for finding optimal split points.
           Defaults to ``"auto"``.
    :type histogram_type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"]
    :param categorical_encoding: Encoding scheme for categorical features.
           Defaults to ``"auto"``.
    :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
           "sort_by_response", "enum_limited"]
    :param calibrate_model: Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide
           more accurate estimates of class probabilities.
           Defaults to ``False``.
    :type calibrate_model: bool
    :param calibration_frame: Calibration frame for Platt Scaling.
           Defaults to ``None``.
    :type calibration_frame: Union[None, str, H2OFrame], optional
    :param distribution: Distribution function.
           Defaults to ``"auto"``.
    :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
           "quantile", "huber"]
    :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName`.
           Defaults to ``None``.
    :type custom_metric_func: str, optional
    :param export_checkpoints_dir: Automatically export generated models to this directory.
           Defaults to ``None``.
    :type export_checkpoints_dir: str, optional
    :param check_constant_response: Check if response column is constant. If enabled, then an exception is thrown if
           the response column is a constant value. If disabled, then model will train regardless of the response
           column being a constant value or not.
           Defaults to ``True``.
    :type check_constant_response: bool
    :param gainslift_bins: Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic
           binning.
           Defaults to ``-1``.
    :type gainslift_bins: int
    :param auc_type: Set default multinomial AUC type.
           Defaults to ``"auto"``.
    :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
    """
    super(H2ORandomForestEstimator, self).__init__()
    # Parameter map that the property getters/setters of this class read and write.
    self._parms = {}
    # model_id is written straight into the map (and mirrored on _id) instead of
    # going through an attribute assignment like the parameters below.
    self._id = self._parms['model_id'] = model_id
    # Every remaining parameter is assigned as an attribute, so a property setter
    # defined on the class for that name gets the chance to check the value.
    self.training_frame = training_frame
    self.validation_frame = validation_frame
    self.nfolds = nfolds
    self.keep_cross_validation_models = keep_cross_validation_models
    self.keep_cross_validation_predictions = keep_cross_validation_predictions
    self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
    self.score_each_iteration = score_each_iteration
    self.score_tree_interval = score_tree_interval
    self.fold_assignment = fold_assignment
    self.fold_column = fold_column
    self.response_column = response_column
    self.ignored_columns = ignored_columns
    self.ignore_const_cols = ignore_const_cols
    self.weights_column = weights_column
    self.balance_classes = balance_classes
    self.class_sampling_factors = class_sampling_factors
    self.max_after_balance_size = max_after_balance_size
    self.max_confusion_matrix_size = max_confusion_matrix_size
    self.ntrees = ntrees
    self.max_depth = max_depth
    self.min_rows = min_rows
    self.nbins = nbins
    self.nbins_top_level = nbins_top_level
    self.nbins_cats = nbins_cats
    self.r2_stopping = r2_stopping
    self.stopping_rounds = stopping_rounds
    self.stopping_metric = stopping_metric
    self.stopping_tolerance = stopping_tolerance
    self.max_runtime_secs = max_runtime_secs
    self.seed = seed
    self.build_tree_one_node = build_tree_one_node
    self.mtries = mtries
    self.sample_rate = sample_rate
    self.sample_rate_per_class = sample_rate_per_class
    self.binomial_double_trees = binomial_double_trees
    self.checkpoint = checkpoint
    self.col_sample_rate_change_per_level = col_sample_rate_change_per_level
    self.col_sample_rate_per_tree = col_sample_rate_per_tree
    self.min_split_improvement = min_split_improvement
    self.histogram_type = histogram_type
    self.categorical_encoding = categorical_encoding
    self.calibrate_model = calibrate_model
    self.calibration_frame = calibration_frame
    self.distribution = distribution
    self.custom_metric_func = custom_metric_func
    self.export_checkpoints_dir = export_checkpoints_dir
    self.check_constant_response = check_constant_response
    self.gainslift_bins = gainslift_bins
    self.auc_type = auc_type
@property
def training_frame(self):
    """
    Id of the training data frame.

    Type: ``Union[None, str, H2OFrame]``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8],
    ...                                 seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train,
    ...                validation_frame=valid)
    >>> cars_drf.auc(valid=True)
    """
    return self._parms.get("training_frame")

@training_frame.setter
def training_frame(self, training_frame):
    # The stored value is whatever H2OFrame._validate returns for the argument,
    # not necessarily the raw argument itself.
    self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def validation_frame(self):
    """
    Id of the validation data frame.

    Type: ``Union[None, str, H2OFrame]``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8],
    ...                                 seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train,
    ...                validation_frame=valid)
    >>> cars_drf.auc(valid=True)
    """
    return self._parms.get("validation_frame")

@validation_frame.setter
def validation_frame(self, validation_frame):
    # The stored value is whatever H2OFrame._validate returns for the argument,
    # not necessarily the raw argument itself.
    self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def nfolds(self):
    """
    Number of folds for K-fold cross-validation (0 to disable or >= 2).

    Type: ``int``, defaults to ``0``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> folds = 5
    >>> cars_drf = H2ORandomForestEstimator(nfolds=folds,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=cars)
    >>> cars_drf.auc(xval=True)
    """
    return self._parms.get("nfolds")

@nfolds.setter
def nfolds(self, nfolds):
    # Checked against (None, int) before being stored in the parameter map.
    assert_is_type(nfolds, None, int)
    self._parms["nfolds"] = nfolds
@property
def keep_cross_validation_models(self):
    """
    Whether to keep the cross-validation models.

    Type: ``bool``, defaults to ``True``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_models=True,
    ...                                     nfolds=5,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train)
    >>> cars_drf.auc()
    """
    return self._parms.get("keep_cross_validation_models")

@keep_cross_validation_models.setter
def keep_cross_validation_models(self, keep_cross_validation_models):
    # Checked against (None, bool) before being stored in the parameter map.
    assert_is_type(keep_cross_validation_models, None, bool)
    self._parms["keep_cross_validation_models"] = keep_cross_validation_models
@property
def keep_cross_validation_predictions(self):
    """
    Whether to keep the predictions of the cross-validation models.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_predictions=True,
    ...                                     nfolds=5,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train)
    >>> cars_drf.cross_validation_predictions()
    """
    return self._parms.get("keep_cross_validation_predictions")

@keep_cross_validation_predictions.setter
def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
    # Checked against (None, bool) before being stored in the parameter map.
    assert_is_type(keep_cross_validation_predictions, None, bool)
    self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions
@property
def keep_cross_validation_fold_assignment(self):
    """
    Whether to keep the cross-validation fold assignment.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_fold_assignment=True,
    ...                                     nfolds=5,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train)
    >>> cars_drf.cross_validation_fold_assignment()
    """
    return self._parms.get("keep_cross_validation_fold_assignment")

@keep_cross_validation_fold_assignment.setter
def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
    # Checked against (None, bool) before being stored in the parameter map.
    assert_is_type(keep_cross_validation_fold_assignment, None, bool)
    self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment
@property
def score_each_iteration(self):
    """
    Whether to score during each iteration of model training.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(score_each_iteration=True,
    ...                                     ntrees=55,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train,
    ...                validation_frame=valid)
    >>> cars_drf.scoring_history()
    """
    return self._parms.get("score_each_iteration")

@score_each_iteration.setter
def score_each_iteration(self, score_each_iteration):
    # Checked against (None, bool) before being stored in the parameter map.
    assert_is_type(score_each_iteration, None, bool)
    self._parms["score_each_iteration"] = score_each_iteration
@property
def score_tree_interval(self):
    """
    Score the model after every so many trees. Disabled if set to 0.

    Type: ``int``, defaults to ``0``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(score_tree_interval=5,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train,
    ...                validation_frame=valid)
    >>> cars_drf.scoring_history()
    """
    return self._parms.get("score_tree_interval")

@score_tree_interval.setter
def score_tree_interval(self, score_tree_interval):
    # Checked against (None, int) before being stored in the parameter map.
    assert_is_type(score_tree_interval, None, int)
    self._parms["score_tree_interval"] = score_tree_interval
@property
def fold_assignment(self):
    """
    Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
    the folds based on the response variable, for classification problems.

    Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> assignment_type = "Random"
    >>> cars_drf = H2ORandomForestEstimator(fold_assignment=assignment_type,
    ...                                     nfolds=5,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=cars)
    >>> cars_drf.auc(xval=True)
    """
    return self._parms.get("fold_assignment")

@fold_assignment.setter
def fold_assignment(self, fold_assignment):
    # Checked against None or one of the four enumerated scheme names before
    # being stored in the parameter map.
    assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified"))
    self._parms["fold_assignment"] = fold_assignment
@property
def fold_column(self):
    """
    Column with cross-validation fold index assignment per observation.

    Type: ``str``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
    >>> fold_numbers.set_names(["fold_numbers"])
    >>> cars = cars.cbind(fold_numbers)
    >>> print(cars['fold_numbers'])
    >>> cars_drf = H2ORandomForestEstimator(seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=cars,
    ...                fold_column="fold_numbers")
    >>> cars_drf.auc(xval=True)
    """
    return self._parms.get("fold_column")

@fold_column.setter
def fold_column(self, fold_column):
    # Checked against (None, str) before being stored in the parameter map.
    assert_is_type(fold_column, None, str)
    self._parms["fold_column"] = fold_column
@property
def response_column(self):
    """
    Response variable column.

    Type: ``str``.
    """
    return self._parms.get("response_column")

@response_column.setter
def response_column(self, response_column):
    # Checked against (None, str) before being stored in the parameter map.
    assert_is_type(response_column, None, str)
    self._parms["response_column"] = response_column
@property
def ignored_columns(self):
    """
    Names of columns to ignore for training.

    Type: ``List[str]``.
    """
    return self._parms.get("ignored_columns")

@ignored_columns.setter
def ignored_columns(self, ignored_columns):
    # Checked against None or the ``[str]`` type spec before being stored in the
    # parameter map.
    assert_is_type(ignored_columns, None, [str])
    self._parms["ignored_columns"] = ignored_columns
@property
def ignore_const_cols(self):
    """
    Ignore constant columns.

    Type: ``bool``, defaults to ``True``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> cars["const_1"] = 6
    >>> cars["const_2"] = 7
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(seed=1234,
    ...                                     ignore_const_cols=True)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train,
    ...                validation_frame=valid)
    >>> cars_drf.auc(valid=True)
    """
    return self._parms.get("ignore_const_cols")

@ignore_const_cols.setter
def ignore_const_cols(self, ignore_const_cols):
    # Checked against (None, bool) before being stored in the parameter map.
    assert_is_type(ignore_const_cols, None, bool)
    self._parms["ignore_const_cols"] = ignore_const_cols
@property
def weights_column(self):
    """
    Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
    dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
    weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
    frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
    During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
    weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an
    accurate prediction, remove all rows with weight == 0.

    Type: ``str``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8],
    ...                                 seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(seed=1234)
    >>> cars_drf.train(x=predictors,
    ...                y=response,
    ...                training_frame=train,
    ...                validation_frame=valid,
    ...                weights_column="weight")
    >>> cars_drf.auc(valid=True)
    """
    return self._parms.get("weights_column")

@weights_column.setter
def weights_column(self, weights_column):
    # Checked against (None, str) before being stored in the parameter map.
    assert_is_type(weights_column, None, str)
    self._parms["weights_column"] = weights_column
@property
def balance_classes(self):
    """
    Balance training data class counts via over/under-sampling (for imbalanced data).

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
    >>> covtype[54] = covtype[54].asfactor()
    >>> predictors = covtype.columns[0:54]
    >>> response = 'C55'
    >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
    >>> cov_drf = H2ORandomForestEstimator(balance_classes=True,
    ...                                    seed=1234)
    >>> cov_drf.train(x=predictors,
    ...               y=response,
    ...               training_frame=train,
    ...               validation_frame=valid)
    >>> print('logloss', cov_drf.logloss(valid=True))
    """
    return self._parms.get("balance_classes")

@balance_classes.setter
def balance_classes(self, balance_classes):
    # Checked against (None, bool) before being stored in the parameter map.
    assert_is_type(balance_classes, None, bool)
    self._parms["balance_classes"] = balance_classes
@property
def class_sampling_factors(self):
    """
    Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
    be automatically computed to obtain class balance during training. Requires balance_classes.

    Type: ``List[float]``.

    :examples:

    >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
    >>> covtype[54] = covtype[54].asfactor()
    >>> predictors = covtype.columns[0:54]
    >>> response = 'C55'
    >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
    >>> print(covtype[54].table())
    >>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
    >>> cov_drf = H2ORandomForestEstimator(balance_classes=True,
    ...                                    class_sampling_factors=sample_factors,
    ...                                    seed=1234)
    >>> cov_drf.train(x=predictors,
    ...               y=response,
    ...               training_frame=train,
    ...               validation_frame=valid)
    >>> print('logloss', cov_drf.logloss(valid=True))
    """
    return self._parms.get("class_sampling_factors")

@class_sampling_factors.setter
def class_sampling_factors(self, class_sampling_factors):
    # Checked against None or the ``[float]`` type spec before being stored in
    # the parameter map.
    assert_is_type(class_sampling_factors, None, [float])
    self._parms["class_sampling_factors"] = class_sampling_factors
@property
def max_after_balance_size(self):
    """
    Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
    balance_classes.

    Type: ``float``, defaults to ``5.0``.

    :examples:

    >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
    >>> covtype[54] = covtype[54].asfactor()
    >>> predictors = covtype.columns[0:54]
    >>> response = 'C55'
    >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
    >>> print(covtype[54].table())
    >>> max_size = .85
    >>> cov_drf = H2ORandomForestEstimator(balance_classes=True,
    ...                                    max_after_balance_size=max_size,
    ...                                    seed=1234)
    >>> cov_drf.train(x=predictors,
    ...               y=response,
    ...               training_frame=train,
    ...               validation_frame=valid)
    >>> print('logloss', cov_drf.logloss(valid=True))
    """
    return self._parms.get("max_after_balance_size")

@max_after_balance_size.setter
def max_after_balance_size(self, max_after_balance_size):
    # Checked against (None, float) before being stored in the parameter map.
    assert_is_type(max_after_balance_size, None, float)
    self._parms["max_after_balance_size"] = max_after_balance_size
@property
def max_confusion_matrix_size(self):
    """
    [Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs.

    Type: ``int``, defaults to ``20``.
    """
    return self._parms.get("max_confusion_matrix_size")

@max_confusion_matrix_size.setter
def max_confusion_matrix_size(self, max_confusion_matrix_size):
    # Checked against (None, int) before being stored in the parameter map.
    assert_is_type(max_confusion_matrix_size, None, int)
    self._parms["max_confusion_matrix_size"] = max_confusion_matrix_size
@property
def ntrees(self):
    """
    Number of trees.

    Type: ``int``, defaults to ``50``.

    :examples:

    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> titanic['survived'] = titanic['survived'].asfactor()
    >>> predictors = titanic.columns
    >>> del predictors[1:3]
    >>> response = 'survived'
    >>> train, valid = titanic.split_frame(ratios=[.8],
    ...                                    seed=1234)
    >>> tree_num = [20, 50, 80, 110,
    ...             140, 170, 200]
    >>> label = ["20", "50", "80", "110",
    ...          "140", "170", "200"]
    >>> for key, num in enumerate(tree_num):
    ...     # Input an integer for 'num' and 'key'
    ...     titanic_drf = H2ORandomForestEstimator(ntrees=num,
    ...                                            seed=1234)
    ...     titanic_drf.train(x=predictors,
    ...                       y=response,
    ...                       training_frame=train,
    ...                       validation_frame=valid)
    ...     print(label[key], 'training score',
    ...           titanic_drf.auc(train=True))
    ...     print(label[key], 'validation score',
    ...           titanic_drf.auc(valid=True))
    """
    return self._parms.get("ntrees")

@ntrees.setter
def ntrees(self, ntrees):
    # Checked against (None, int) before being stored in the parameter map.
    assert_is_type(ntrees, None, int)
    self._parms["ntrees"] = ntrees
@property
def max_depth(self):
    """
    Maximum tree depth (0 for unlimited).

    Type: ``int``, defaults to ``20``.

    :examples:

    >>> df = h2o.import_file(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> response = "survived"
    >>> df[response] = df[response].asfactor()
    >>> predictors = df.columns
    >>> del predictors[1:3]
    >>> train, valid, test = df.split_frame(ratios=[0.6,0.2],
    ...                                     seed=1234,
    ...                                     destination_frames=
    ...                                     ['train.hex','valid.hex','test.hex'])
    >>> drf = H2ORandomForestEstimator(max_depth=10,
    ...                                seed=1234)
    >>> drf.train(x=predictors,
    ...           y=response,
    ...           training_frame=train)
    >>> perf = drf.model_performance(valid)
    >>> print(perf.auc())
    """
    return self._parms.get("max_depth")

@max_depth.setter
def max_depth(self, max_depth):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(max_depth, None, int)
    self._parms["max_depth"] = max_depth
@property
def min_rows(self):
    """
    Minimum number of (weighted) observations allowed in a leaf.

    Type: ``float``, defaults to ``1.0``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(min_rows=16, seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(cars_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("min_rows")

@min_rows.setter
def min_rows(self, min_rows):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(min_rows, None, numeric)
    self._parms["min_rows"] = min_rows
@property
def nbins(self):
    """
    For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point

    Type: ``int``, defaults to ``20``.

    :examples:

    >>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
    >>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
    >>> predictors = eeg.columns[:-1]
    >>> response = 'eyeDetection'
    >>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
    >>> bin_num = [16, 32, 64, 128, 256, 512]
    >>> label = ["16", "32", "64", "128", "256", "512"]
    >>> for key, num in enumerate(bin_num):
    ...     eeg_drf = H2ORandomForestEstimator(nbins=num, seed=1234)
    ...     eeg_drf.train(x=predictors,
    ...                   y=response,
    ...                   training_frame=train,
    ...                   validation_frame=valid)
    ...     print(label[key], 'training score',
    ...           eeg_drf.auc(train=True))
    ...     print(label[key], 'validation score',
    ...           eeg_drf.auc(valid=True))
    """
    return self._parms.get("nbins")

@nbins.setter
def nbins(self, nbins):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(nbins, None, int)
    self._parms["nbins"] = nbins
@property
def nbins_top_level(self):
    """
    For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease
    by factor of two per level

    Type: ``int``, defaults to ``1024``.

    :examples:

    >>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
    >>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
    >>> predictors = eeg.columns[:-1]
    >>> response = 'eyeDetection'
    >>> train, valid = eeg.split_frame(ratios=[.8],
    ...                                seed=1234)
    >>> bin_num = [32, 64, 128, 256, 512,
    ...            1024, 2048, 4096]
    >>> label = ["32", "64", "128", "256",
    ...          "512", "1024", "2048", "4096"]
    >>> for key, num in enumerate(bin_num):
    ...     eeg_drf = H2ORandomForestEstimator(nbins_top_level=num,
    ...                                        seed=1234)
    ...     eeg_drf.train(x=predictors,
    ...                   y=response,
    ...                   training_frame=train,
    ...                   validation_frame=valid)
    ...     print(label[key], 'training score',
    ...           eeg_drf.auc(train=True))
    ...     print(label[key], 'validation score',
    ...           eeg_drf.auc(valid=True))
    """
    return self._parms.get("nbins_top_level")

@nbins_top_level.setter
def nbins_top_level(self, nbins_top_level):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(nbins_top_level, None, int)
    self._parms["nbins_top_level"] = nbins_top_level
@property
def nbins_cats(self):
    """
    For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher
    values can lead to more overfitting.

    Type: ``int``, defaults to ``1024``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> bin_num = [8, 16, 32, 64, 128, 256,
    ...            512, 1024, 2048, 4096]
    >>> label = ["8", "16", "32", "64", "128",
    ...          "256", "512", "1024", "2048", "4096"]
    >>> for key, num in enumerate(bin_num):
    ...     airlines_drf = H2ORandomForestEstimator(nbins_cats=num,
    ...                                             seed=1234)
    ...     airlines_drf.train(x=predictors,
    ...                        y=response,
    ...                        training_frame=train,
    ...                        validation_frame=valid)
    ...     print(label[key], 'training score',
    ...           airlines_drf.auc(train=True))
    ...     print(label[key], 'validation score',
    ...           airlines_drf.auc(valid=True))
    """
    return self._parms.get("nbins_cats")

@nbins_cats.setter
def nbins_cats(self, nbins_cats):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(nbins_cats, None, int)
    self._parms["nbins_cats"] = nbins_cats
@property
def r2_stopping(self):
    """
    No longer supported: r2_stopping is ignored if set - please use stopping_rounds, stopping_metric and
    stopping_tolerance instead. Previous versions of H2O would stop making trees when the R^2 metric equalled or
    exceeded this value.

    Type: ``float``, defaults to ``∞``.
    """
    params = self._parms
    return params.get("r2_stopping")

@r2_stopping.setter
def r2_stopping(self, r2_stopping):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(r2_stopping, None, numeric)
    self._parms["r2_stopping"] = r2_stopping
@property
def stopping_rounds(self):
    """
    Early stopping based on convergence of stopping_metric. Training stops if the simple moving average of length k
    of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable).

    Type: ``int``, defaults to ``0``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc",
    ...                                         stopping_rounds=3,
    ...                                         stopping_tolerance=1e-2,
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> airlines_drf.auc(valid=True)
    """
    params = self._parms
    return params.get("stopping_rounds")

@stopping_rounds.setter
def stopping_rounds(self, stopping_rounds):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(stopping_rounds, None, int)
    self._parms["stopping_rounds"] = stopping_rounds
@property
def stopping_metric(self):
    """
    Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score
    for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
    client.

    Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
    "misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc",
    ...                                         stopping_rounds=3,
    ...                                         stopping_tolerance=1e-2,
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> airlines_drf.auc(valid=True)
    """
    params = self._parms
    return params.get("stopping_metric")

@stopping_metric.setter
def stopping_metric(self, stopping_metric):
    # Only ``None`` or one of the enumerated metric names passes validation.
    assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"))
    self._parms["stopping_metric"] = stopping_metric
@property
def stopping_tolerance(self):
    """
    Relative tolerance for the metric-based stopping criterion (stop if relative improvement is not at least this
    much).

    Type: ``float``, defaults to ``0.001``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc",
    ...                                         stopping_rounds=3,
    ...                                         stopping_tolerance=1e-2,
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> airlines_drf.auc(valid=True)
    """
    params = self._parms
    return params.get("stopping_tolerance")

@stopping_tolerance.setter
def stopping_tolerance(self, stopping_tolerance):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(stopping_tolerance, None, numeric)
    self._parms["stopping_tolerance"] = stopping_tolerance
@property
def max_runtime_secs(self):
    """
    Upper bound, in seconds, on the runtime allowed for model training. Use 0 to disable.

    Type: ``float``, defaults to ``0.0``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(max_runtime_secs=10,
    ...                                     ntrees=10000,
    ...                                     max_depth=10,
    ...                                     seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> cars_drf.auc(valid = True)
    """
    params = self._parms
    return params.get("max_runtime_secs")

@max_runtime_secs.setter
def max_runtime_secs(self, max_runtime_secs):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(max_runtime_secs, None, numeric)
    self._parms["max_runtime_secs"] = max_runtime_secs
@property
def seed(self):
    """
    Seed for the pseudo random number generator (if applicable).

    Type: ``int``, defaults to ``-1``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> drf_w_seed_1 = H2ORandomForestEstimator(seed=1234)
    >>> drf_w_seed_1.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print('auc for the 1st model build with a seed:',
    ...       drf_w_seed_1.auc(valid=True))
    """
    params = self._parms
    return params.get("seed")

@seed.setter
def seed(self, seed):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(seed, None, int)
    self._parms["seed"] = seed
@property
def build_tree_one_node(self):
    """
    Run on one node only: no network overhead, but fewer cpus used. Suitable for small datasets.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(build_tree_one_node=True, seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> cars_drf.auc(valid=True)
    """
    params = self._parms
    return params.get("build_tree_one_node")

@build_tree_one_node.setter
def build_tree_one_node(self, build_tree_one_node):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(build_tree_one_node, None, bool)
    self._parms["build_tree_one_node"] = build_tree_one_node
@property
def mtries(self):
    """
    Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt{p} for
    classification and p/3 for regression (where p is the # of predictors).

    Type: ``int``, defaults to ``-1``.

    :examples:

    >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
    >>> covtype[54] = covtype[54].asfactor()
    >>> predictors = covtype.columns[0:54]
    >>> response = 'C55'
    >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
    >>> cov_drf = H2ORandomForestEstimator(mtries=30, seed=1234)
    >>> cov_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print('logloss', cov_drf.logloss(valid=True))
    """
    params = self._parms
    return params.get("mtries")

@mtries.setter
def mtries(self, mtries):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(mtries, None, int)
    self._parms["mtries"] = mtries
@property
def sample_rate(self):
    """
    Per-tree row sample rate (from 0.0 to 1.0).

    Type: ``float``, defaults to ``0.632``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(sample_rate=.7, seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(airlines_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("sample_rate")

@sample_rate.setter
def sample_rate(self, sample_rate):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(sample_rate, None, numeric)
    self._parms["sample_rate"] = sample_rate
@property
def sample_rate_per_class(self):
    """
    List of per-class row sample rates (relative fraction for each class, from 0.0 to 1.0), applied for each tree.

    Type: ``List[float]``.

    :examples:

    >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
    >>> covtype[54] = covtype[54].asfactor()
    >>> predictors = covtype.columns[0:54]
    >>> response = 'C55'
    >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
    >>> print(train[response].table())
    >>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1]
    >>> cov_drf = H2ORandomForestEstimator(sample_rate_per_class=rate_per_class_list,
    ...                                    seed=1234)
    >>> cov_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print('logloss', cov_drf.logloss(valid=True))
    """
    params = self._parms
    return params.get("sample_rate_per_class")

@sample_rate_per_class.setter
def sample_rate_per_class(self, sample_rate_per_class):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(sample_rate_per_class, None, [numeric])
    self._parms["sample_rate_per_class"] = sample_rate_per_class
@property
def binomial_double_trees(self):
    """
    For binary classification: build 2x as many trees (one per class) - can lead to higher accuracy.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(binomial_double_trees=False, seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print('without binomial_double_trees:',
    ...       cars_drf.auc(valid=True))
    >>> cars_drf_2 = H2ORandomForestEstimator(binomial_double_trees=True, seed=1234)
    >>> cars_drf_2.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print('with binomial_double_trees:', cars_drf_2.auc(valid=True))
    """
    params = self._parms
    return params.get("binomial_double_trees")

@binomial_double_trees.setter
def binomial_double_trees(self, binomial_double_trees):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(binomial_double_trees, None, bool)
    self._parms["binomial_double_trees"] = binomial_double_trees
@property
def checkpoint(self):
    """
    Model checkpoint from which to resume training.

    Type: ``Union[None, str, H2OEstimator]``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(ntrees=1, seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(cars_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("checkpoint")

@checkpoint.setter
def checkpoint(self, checkpoint):
    # Accepts ``None``, a string, or an ``H2OEstimator`` instance.
    assert_is_type(checkpoint, None, str, H2OEstimator)
    self._parms["checkpoint"] = checkpoint
@property
def col_sample_rate_change_per_level(self):
    """
    Relative change of the column sampling rate applied at every level (must be > 0.0 and <= 2.0).

    Type: ``float``, defaults to ``1.0``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(col_sample_rate_change_per_level=.9,
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(airlines_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("col_sample_rate_change_per_level")

@col_sample_rate_change_per_level.setter
def col_sample_rate_change_per_level(self, col_sample_rate_change_per_level):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(col_sample_rate_change_per_level, None, numeric)
    self._parms["col_sample_rate_change_per_level"] = col_sample_rate_change_per_level
@property
def col_sample_rate_per_tree(self):
    """
    Per-tree column sample rate (from 0.0 to 1.0).

    Type: ``float``, defaults to ``1.0``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(col_sample_rate_per_tree=.7,
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(airlines_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("col_sample_rate_per_tree")

@col_sample_rate_per_tree.setter
def col_sample_rate_per_tree(self, col_sample_rate_per_tree):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(col_sample_rate_per_tree, None, numeric)
    self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree
@property
def min_split_improvement(self):
    """
    Minimum relative improvement in squared error reduction required for a split to happen.

    Type: ``float``, defaults to ``1e-05``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(min_split_improvement=1e-3, seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(cars_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("min_split_improvement")

@min_split_improvement.setter
def min_split_improvement(self, min_split_improvement):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(min_split_improvement, None, numeric)
    self._parms["min_split_improvement"] = min_split_improvement
@property
def histogram_type(self):
    """
    Type of histogram used for finding optimal split points.

    Type: ``Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"]``, defaults to
    ``"auto"``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> airlines_drf = H2ORandomForestEstimator(histogram_type="UniformAdaptive",
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> print(airlines_drf.auc(valid=True))
    """
    params = self._parms
    return params.get("histogram_type")

@histogram_type.setter
def histogram_type(self, histogram_type):
    # Only ``None`` or one of the enumerated histogram names passes validation.
    assert_is_type(histogram_type, None, Enum("auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"))
    self._parms["histogram_type"] = histogram_type
@property
def categorical_encoding(self):
    """
    Encoding scheme applied to categorical features.

    Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
    "sort_by_response", "enum_limited"]``, defaults to ``"auto"``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
    >>> airlines["Year"] = airlines["Year"].asfactor()
    >>> airlines["Month"] = airlines["Month"].asfactor()
    >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
    >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
    ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
    >>> response = "IsDepDelayed"
    >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
    >>> encoding = "one_hot_explicit"
    >>> airlines_drf = H2ORandomForestEstimator(categorical_encoding=encoding,
    ...                                         seed=1234)
    >>> airlines_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> airlines_drf.auc(valid=True)
    """
    params = self._parms
    return params.get("categorical_encoding")

@categorical_encoding.setter
def categorical_encoding(self, categorical_encoding):
    # Only ``None`` or one of the enumerated encoding names passes validation.
    assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
    self._parms["categorical_encoding"] = categorical_encoding
@property
def calibrate_model(self):
    """
    Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates
    of class probabilities.

    Type: ``bool``, defaults to ``False``.

    :examples:

    >>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
    >>> ecology['Angaus'] = ecology['Angaus'].asfactor()
    >>> from h2o.estimators.random_forest import H2ORandomForestEstimator
    >>> response = 'Angaus'
    >>> predictors = ecology.columns[3:13]
    >>> train, calib = ecology.split_frame(seed=12354)
    >>> w = h2o.create_frame(binary_fraction=1,
    ...                      binary_ones_fraction=0.5,
    ...                      missing_fraction=0,
    ...                      rows=744, cols=1)
    >>> w.set_names(["weight"])
    >>> train = train.cbind(w)
    >>> ecology_drf = H2ORandomForestEstimator(ntrees=10,
    ...                                        max_depth=5,
    ...                                        min_rows=10,
    ...                                        distribution="multinomial",
    ...                                        weights_column="weight",
    ...                                        calibrate_model=True,
    ...                                        calibration_frame=calib)
    >>> ecology_drf.train(x=predictors, y="Angaus", training_frame=train)
    >>> predicted = ecology_drf.predict(calib)
    """
    params = self._parms
    return params.get("calibrate_model")

@calibrate_model.setter
def calibrate_model(self, calibrate_model):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(calibrate_model, None, bool)
    self._parms["calibrate_model"] = calibrate_model
@property
def calibration_frame(self):
    """
    Calibration frame for Platt Scaling

    Type: ``Union[None, str, H2OFrame]``.

    :examples:

    >>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv")
    >>> ecology['Angaus'] = ecology['Angaus'].asfactor()
    >>> response = 'Angaus'
    >>> predictors = ecology.columns[3:13]
    >>> train, calib = ecology.split_frame(seed = 12354)
    >>> w = h2o.create_frame(binary_fraction=1,
    ...                      binary_ones_fraction=0.5,
    ...                      missing_fraction=0,
    ...                      rows=744, cols=1)
    >>> w.set_names(["weight"])
    >>> train = train.cbind(w)
    >>> ecology_drf = H2ORandomForestEstimator(ntrees=10,
    ...                                        max_depth=5,
    ...                                        min_rows=10,
    ...                                        distribution="multinomial",
    ...                                        calibrate_model=True,
    ...                                        calibration_frame=calib)
    >>> ecology_drf.train(x=predictors,
    ...                   y="Angaus",
    ...                   training_frame=train,
    ...                   weights_column="weight")
    >>> predicted = ecology_drf.predict(train)
    """
    return self._parms.get("calibration_frame")

@calibration_frame.setter
def calibration_frame(self, calibration_frame):
    # The stored value is whatever ``H2OFrame._validate`` returns for the supplied frame.
    self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame')
@property
def distribution(self):
    """
    Distribution function.

    Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
    "quantile", "huber"]``, defaults to ``"auto"``.

    :examples:

    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "cylinders"
    >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
    >>> cars_drf = H2ORandomForestEstimator(distribution="poisson", seed=1234)
    >>> cars_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    >>> cars_drf.mse(valid=True)
    """
    params = self._parms
    return params.get("distribution")

@distribution.setter
def distribution(self, distribution):
    # Only ``None`` or one of the enumerated distribution names passes validation.
    assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"))
    self._parms["distribution"] = distribution
@property
def custom_metric_func(self):
    """
    Reference to a custom evaluation function, given in the format `language:keyName=funcName`.

    Type: ``str``.
    """
    params = self._parms
    return params.get("custom_metric_func")

@custom_metric_func.setter
def custom_metric_func(self, custom_metric_func):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(custom_metric_func, None, str)
    self._parms["custom_metric_func"] = custom_metric_func
@property
def export_checkpoints_dir(self):
    """
    Directory to which generated models are automatically exported.

    Type: ``str``.

    :examples:

    >>> import tempfile
    >>> from os import listdir
    >>> from h2o.grid.grid_search import H2OGridSearch
    >>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
    >>> predictors = ["DayofMonth", "DayOfWeek"]
    >>> response = "IsDepDelayed"
    >>> hyper_parameters = {'ntrees': [5,10]}
    >>> search_crit = {'strategy': "RandomDiscrete",
    ...                'max_models': 5,
    ...                'seed': 1234,
    ...                'stopping_rounds': 3,
    ...                'stopping_metric': "AUTO",
    ...                'stopping_tolerance': 1e-2}
    >>> checkpoints_dir = tempfile.mkdtemp()
    >>> air_grid = H2OGridSearch(H2ORandomForestEstimator,
    ...                          hyper_params=hyper_parameters,
    ...                          search_criteria=search_crit)
    >>> air_grid.train(x=predictors,
    ...                y=response,
    ...                training_frame=airlines,
    ...                distribution="bernoulli",
    ...                max_depth=3,
    ...                export_checkpoints_dir=checkpoints_dir)
    >>> num_files = len(listdir(checkpoints_dir))
    >>> num_files
    """
    params = self._parms
    return params.get("export_checkpoints_dir")

@export_checkpoints_dir.setter
def export_checkpoints_dir(self, export_checkpoints_dir):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(export_checkpoints_dir, None, str)
    self._parms["export_checkpoints_dir"] = export_checkpoints_dir
@property
def check_constant_response(self):
    """
    Check if response column is constant. If enabled, then an exception is thrown if the response column is a
    constant value. If disabled, then model will train regardless of the response column being a constant value or
    not.

    Type: ``bool``, defaults to ``True``.

    :examples:

    >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
    >>> train["constantCol"] = 1
    >>> my_drf = H2ORandomForestEstimator(check_constant_response=False)
    >>> my_drf.train(x=list(range(1,5)), y="constantCol", training_frame=train)
    """
    params = self._parms
    return params.get("check_constant_response")

@check_constant_response.setter
def check_constant_response(self, check_constant_response):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(check_constant_response, None, bool)
    self._parms["check_constant_response"] = check_constant_response
@property
def gainslift_bins(self):
    """
    Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic binning.

    Type: ``int``, defaults to ``-1``.

    :examples:

    >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv")
    >>> model = H2ORandomForestEstimator(ntrees=1, gainslift_bins=20)
    >>> model.train(x=["Origin", "Distance"], y="IsDepDelayed", training_frame=airlines)
    >>> model.gains_lift()
    """
    params = self._parms
    return params.get("gainslift_bins")

@gainslift_bins.setter
def gainslift_bins(self, gainslift_bins):
    # Type-check first; the value is stored in the parameter map afterwards.
    assert_is_type(gainslift_bins, None, int)
    self._parms["gainslift_bins"] = gainslift_bins
@property
def auc_type(self):
    """
    Default multinomial AUC type.

    Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
    ``"auto"``.
    """
    params = self._parms
    return params.get("auc_type")

@auc_type.setter
def auc_type(self, auc_type):
    # Only ``None`` or one of the enumerated AUC type names passes validation.
    assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
    self._parms["auc_type"] = auc_type
# ``offset_column`` is exposed through ``deprecated_property`` rather than a regular property pair;
# the constructor likewise lists it under ``@deprecated_params({'offset_column': None})``.
# NOTE(review): the ``None`` second argument presumably means "no replacement parameter" -
# confirm against ``h2o.utils.metaclass.deprecated_property``.
offset_column = deprecated_property('offset_column', None)