#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from .estimator_base import H2OEstimator
[docs]class H2ORandomForestEstimator(H2OEstimator):
"""
Distributed Random Forest
Parameters
----------
model_id : str
Destination id for this model; auto-generated if not specified.
training_frame : str
Id of the training data frame (Not required, to allow initial validation of model parameters).
validation_frame : str
Id of the validation data frame.
nfolds : int
Number of folds for N-fold cross-validation (0 to disable or ≥ 2).
Default: 0
keep_cross_validation_predictions : bool
Whether to keep the predictions of the cross-validation models.
Default: False
keep_cross_validation_fold_assignment : bool
Whether to keep the cross-validation fold assignment.
Default: False
score_each_iteration : bool
Whether to score during each iteration of model training.
Default: False
score_tree_interval : int
Score the model after every so many trees. Disabled if set to 0.
Default: 0
fold_assignment : "AUTO" | "Random" | "Modulo" | "Stratified"
Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
the folds based on the response variable, for classification problems.
Default: "AUTO"
fold_column : VecSpecifier
Column with cross-validation fold index assignment per observation.
response_column : VecSpecifier
Response variable column.
ignored_columns : list(str)
Names of columns to ignore for training.
ignore_const_cols : bool
Ignore constant columns.
Default: True
offset_column : VecSpecifier
Offset column. This will be added to the combination of columns before applying the link function.
weights_column : VecSpecifier
Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed.
balance_classes : bool
Balance training data class counts via over/under-sampling (for imbalanced data).
Default: False
class_sampling_factors : list(float)
Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
be automatically computed to obtain class balance during training. Requires balance_classes.
max_after_balance_size : float
Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
balance_classes.
Default: 5.0
max_confusion_matrix_size : int
Maximum size (# classes) for confusion matrices to be printed in the Logs
Default: 20
max_hit_ratio_k : int
Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)
Default: 0
ntrees : int
Number of trees.
Default: 50
max_depth : int
Maximum tree depth.
Default: 20
min_rows : float
Fewest allowed (weighted) observations in a leaf (in R called 'nodesize').
Default: 1.0
nbins : int
For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point
Default: 20
nbins_top_level : int
For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease
by factor of two per level
Default: 1024
nbins_cats : int
For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher
values can lead to more overfitting.
Default: 1024
r2_stopping : float
Stop making trees when the R^2 metric equals or exceeds this
Default: 1.79769313486e+308
stopping_rounds : int
Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
Default: 0
stopping_metric : "AUTO" | "deviance" | "logloss" | "MSE" | "AUC" | "lift_top_group" | "r2" | "misclassification"
| "mean_per_class_error"
Metric to use for early stopping (AUTO: logloss for classification, deviance for regression)
Default: "AUTO"
stopping_tolerance : float
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)
Default: 0.001
max_runtime_secs : float
Maximum allowed runtime in seconds for model training. Use 0 to disable.
Default: 0.0
seed : int
Seed for pseudo random number generator (if applicable)
Default: -1
build_tree_one_node : bool
Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.
Default: False
mtries : int
Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt{p} for
classification and p/3 for regression (where p is the # of predictors
Default: -1
sample_rate : float
Row sample rate per tree (from 0.0 to 1.0)
Default: 0.632000029087
sample_rate_per_class : list(float)
Row sample rate per tree per class (from 0.0 to 1.0)
binomial_double_trees : bool
For binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy.
Default: False
checkpoint : str
Model checkpoint to resume training with.
col_sample_rate_change_per_level : float
Relative change of the column sampling rate for every level (from 0.0 to 2.0)
Default: 1.0
col_sample_rate_per_tree : float
Column sample rate per tree (from 0.0 to 1.0)
Default: 1.0
min_split_improvement : float
Minimum relative improvement in squared error reduction for a split to happen
Default: 1e-05
histogram_type : "AUTO" | "UniformAdaptive" | "Random" | "QuantilesGlobal" | "RoundRobin"
What type of histogram to use for finding optimal split points
Default: "AUTO"
"""
def __init__(self, **kwargs):
super(H2ORandomForestEstimator, self).__init__()
self._parms = {}
for name in ["model_id", "training_frame", "validation_frame", "nfolds", "keep_cross_validation_predictions",
"keep_cross_validation_fold_assignment", "score_each_iteration", "score_tree_interval",
"fold_assignment", "fold_column", "response_column", "ignored_columns", "ignore_const_cols",
"offset_column", "weights_column", "balance_classes", "class_sampling_factors",
"max_after_balance_size", "max_confusion_matrix_size", "max_hit_ratio_k", "ntrees", "max_depth",
"min_rows", "nbins", "nbins_top_level", "nbins_cats", "r2_stopping", "stopping_rounds",
"stopping_metric", "stopping_tolerance", "max_runtime_secs", "seed", "build_tree_one_node",
"mtries", "sample_rate", "sample_rate_per_class", "binomial_double_trees", "checkpoint",
"col_sample_rate_change_per_level", "col_sample_rate_per_tree", "min_split_improvement",
"histogram_type"]:
pname = name[:-1] if name[-1] == '_' else name
self._parms[pname] = kwargs[name] if name in kwargs else None
@property
def training_frame(self):
return self._parms["training_frame"]
@training_frame.setter
def training_frame(self, value):
self._parms["training_frame"] = value
@property
def validation_frame(self):
return self._parms["validation_frame"]
@validation_frame.setter
def validation_frame(self, value):
self._parms["validation_frame"] = value
@property
def nfolds(self):
return self._parms["nfolds"]
@nfolds.setter
def nfolds(self, value):
self._parms["nfolds"] = value
@property
def keep_cross_validation_predictions(self):
return self._parms["keep_cross_validation_predictions"]
@keep_cross_validation_predictions.setter
def keep_cross_validation_predictions(self, value):
self._parms["keep_cross_validation_predictions"] = value
@property
def keep_cross_validation_fold_assignment(self):
return self._parms["keep_cross_validation_fold_assignment"]
@keep_cross_validation_fold_assignment.setter
def keep_cross_validation_fold_assignment(self, value):
self._parms["keep_cross_validation_fold_assignment"] = value
@property
def score_each_iteration(self):
return self._parms["score_each_iteration"]
@score_each_iteration.setter
def score_each_iteration(self, value):
self._parms["score_each_iteration"] = value
@property
def score_tree_interval(self):
return self._parms["score_tree_interval"]
@score_tree_interval.setter
def score_tree_interval(self, value):
self._parms["score_tree_interval"] = value
@property
def fold_assignment(self):
return self._parms["fold_assignment"]
@fold_assignment.setter
def fold_assignment(self, value):
self._parms["fold_assignment"] = value
@property
def fold_column(self):
return self._parms["fold_column"]
@fold_column.setter
def fold_column(self, value):
self._parms["fold_column"] = value
@property
def response_column(self):
return self._parms["response_column"]
@response_column.setter
def response_column(self, value):
self._parms["response_column"] = value
@property
def ignored_columns(self):
return self._parms["ignored_columns"]
@ignored_columns.setter
def ignored_columns(self, value):
self._parms["ignored_columns"] = value
@property
def ignore_const_cols(self):
return self._parms["ignore_const_cols"]
@ignore_const_cols.setter
def ignore_const_cols(self, value):
self._parms["ignore_const_cols"] = value
@property
def offset_column(self):
return self._parms["offset_column"]
@offset_column.setter
def offset_column(self, value):
self._parms["offset_column"] = value
@property
def weights_column(self):
return self._parms["weights_column"]
@weights_column.setter
def weights_column(self, value):
self._parms["weights_column"] = value
@property
def balance_classes(self):
return self._parms["balance_classes"]
@balance_classes.setter
def balance_classes(self, value):
self._parms["balance_classes"] = value
@property
def class_sampling_factors(self):
return self._parms["class_sampling_factors"]
@class_sampling_factors.setter
def class_sampling_factors(self, value):
self._parms["class_sampling_factors"] = value
@property
def max_after_balance_size(self):
return self._parms["max_after_balance_size"]
@max_after_balance_size.setter
def max_after_balance_size(self, value):
self._parms["max_after_balance_size"] = value
@property
def max_confusion_matrix_size(self):
return self._parms["max_confusion_matrix_size"]
@max_confusion_matrix_size.setter
def max_confusion_matrix_size(self, value):
self._parms["max_confusion_matrix_size"] = value
@property
def max_hit_ratio_k(self):
return self._parms["max_hit_ratio_k"]
@max_hit_ratio_k.setter
def max_hit_ratio_k(self, value):
self._parms["max_hit_ratio_k"] = value
@property
def ntrees(self):
return self._parms["ntrees"]
@ntrees.setter
def ntrees(self, value):
self._parms["ntrees"] = value
@property
def max_depth(self):
return self._parms["max_depth"]
@max_depth.setter
def max_depth(self, value):
self._parms["max_depth"] = value
@property
def min_rows(self):
return self._parms["min_rows"]
@min_rows.setter
def min_rows(self, value):
self._parms["min_rows"] = value
@property
def nbins(self):
return self._parms["nbins"]
@nbins.setter
def nbins(self, value):
self._parms["nbins"] = value
@property
def nbins_top_level(self):
return self._parms["nbins_top_level"]
@nbins_top_level.setter
def nbins_top_level(self, value):
self._parms["nbins_top_level"] = value
@property
def nbins_cats(self):
return self._parms["nbins_cats"]
@nbins_cats.setter
def nbins_cats(self, value):
self._parms["nbins_cats"] = value
@property
def r2_stopping(self):
return self._parms["r2_stopping"]
@r2_stopping.setter
def r2_stopping(self, value):
self._parms["r2_stopping"] = value
@property
def stopping_rounds(self):
return self._parms["stopping_rounds"]
@stopping_rounds.setter
def stopping_rounds(self, value):
self._parms["stopping_rounds"] = value
@property
def stopping_metric(self):
return self._parms["stopping_metric"]
@stopping_metric.setter
def stopping_metric(self, value):
self._parms["stopping_metric"] = value
@property
def stopping_tolerance(self):
return self._parms["stopping_tolerance"]
@stopping_tolerance.setter
def stopping_tolerance(self, value):
self._parms["stopping_tolerance"] = value
@property
def max_runtime_secs(self):
return self._parms["max_runtime_secs"]
@max_runtime_secs.setter
def max_runtime_secs(self, value):
self._parms["max_runtime_secs"] = value
@property
def seed(self):
return self._parms["seed"]
@seed.setter
def seed(self, value):
self._parms["seed"] = value
@property
def build_tree_one_node(self):
return self._parms["build_tree_one_node"]
@build_tree_one_node.setter
def build_tree_one_node(self, value):
self._parms["build_tree_one_node"] = value
@property
def mtries(self):
return self._parms["mtries"]
@mtries.setter
def mtries(self, value):
self._parms["mtries"] = value
@property
def sample_rate(self):
return self._parms["sample_rate"]
@sample_rate.setter
def sample_rate(self, value):
self._parms["sample_rate"] = value
@property
def sample_rate_per_class(self):
return self._parms["sample_rate_per_class"]
@sample_rate_per_class.setter
def sample_rate_per_class(self, value):
self._parms["sample_rate_per_class"] = value
@property
def binomial_double_trees(self):
return self._parms["binomial_double_trees"]
@binomial_double_trees.setter
def binomial_double_trees(self, value):
self._parms["binomial_double_trees"] = value
@property
def checkpoint(self):
return self._parms["checkpoint"]
@checkpoint.setter
def checkpoint(self, value):
self._parms["checkpoint"] = value
@property
def col_sample_rate_change_per_level(self):
return self._parms["col_sample_rate_change_per_level"]
@col_sample_rate_change_per_level.setter
def col_sample_rate_change_per_level(self, value):
self._parms["col_sample_rate_change_per_level"] = value
@property
def col_sample_rate_per_tree(self):
return self._parms["col_sample_rate_per_tree"]
@col_sample_rate_per_tree.setter
def col_sample_rate_per_tree(self, value):
self._parms["col_sample_rate_per_tree"] = value
@property
def min_split_improvement(self):
return self._parms["min_split_improvement"]
@min_split_improvement.setter
def min_split_improvement(self, value):
self._parms["min_split_improvement"] = value
@property
def histogram_type(self):
return self._parms["histogram_type"]
@histogram_type.setter
def histogram_type(self, value):
self._parms["histogram_type"] = value