Source code for h2o.estimators.random_forest

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals

from h2o.utils.metaclass import deprecated_params, deprecated_property
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


[docs]class H2ORandomForestEstimator(H2OEstimator): """ Distributed Random Forest Builds a Distributed Random Forest (DRF) on a parsed dataset, for regression or classification. """ algo = "drf" supervised_learning = True _options_ = {'model_extensions': ['h2o.model.extensions.ScoringHistoryTrees', 'h2o.model.extensions.VariableImportance', 'h2o.model.extensions.Trees', 'h2o.model.extensions.SupervisedTrees'], 'verbose': True} @deprecated_params({'offset_column': None}) def __init__(self, model_id=None, # type: Optional[Union[None, str, H2OEstimator]] training_frame=None, # type: Optional[Union[None, str, H2OFrame]] validation_frame=None, # type: Optional[Union[None, str, H2OFrame]] nfolds=0, # type: int keep_cross_validation_models=True, # type: bool keep_cross_validation_predictions=False, # type: bool keep_cross_validation_fold_assignment=False, # type: bool score_each_iteration=False, # type: bool score_tree_interval=0, # type: int fold_assignment="auto", # type: Literal["auto", "random", "modulo", "stratified"] fold_column=None, # type: Optional[str] response_column=None, # type: Optional[str] ignored_columns=None, # type: Optional[List[str]] ignore_const_cols=True, # type: bool weights_column=None, # type: Optional[str] balance_classes=False, # type: bool class_sampling_factors=None, # type: Optional[List[float]] max_after_balance_size=5.0, # type: float max_confusion_matrix_size=20, # type: int ntrees=50, # type: int max_depth=20, # type: int min_rows=1.0, # type: float nbins=20, # type: int nbins_top_level=1024, # type: int nbins_cats=1024, # type: int r2_stopping=None, # type: Optional[float] stopping_rounds=0, # type: int stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"] stopping_tolerance=0.001, # type: float max_runtime_secs=0.0, # type: float seed=-1, # type: int build_tree_one_node=False, # type: bool mtries=-1, # type: int sample_rate=0.632, # type: float sample_rate_per_class=None, # type: Optional[List[float]] binomial_double_trees=False, # type: bool checkpoint=None, # type: Optional[Union[None, str, H2OEstimator]] col_sample_rate_change_per_level=1.0, # type: float col_sample_rate_per_tree=1.0, # type: float min_split_improvement=1e-05, # type: float histogram_type="auto", # type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"] categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] calibrate_model=False, # type: bool calibration_frame=None, # type: Optional[Union[None, str, H2OFrame]] distribution="auto", # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"] custom_metric_func=None, # type: Optional[str] export_checkpoints_dir=None, # type: Optional[str] check_constant_response=True, # type: bool gainslift_bins=-1, # type: int auc_type="auto", # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"] ): """ :param model_id: Destination id for this model; auto-generated if not specified. Defaults to ``None``. :type model_id: Union[None, str, H2OEstimator], optional :param training_frame: Id of the training data frame. Defaults to ``None``. :type training_frame: Union[None, str, H2OFrame], optional :param validation_frame: Id of the validation data frame. Defaults to ``None``. :type validation_frame: Union[None, str, H2OFrame], optional :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2). Defaults to ``0``. :type nfolds: int :param keep_cross_validation_models: Whether to keep the cross-validation models. Defaults to ``True``. :type keep_cross_validation_models: bool :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models. Defaults to ``False``. :type keep_cross_validation_predictions: bool :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment. Defaults to ``False``. :type keep_cross_validation_fold_assignment: bool :param score_each_iteration: Whether to score during each iteration of model training. Defaults to ``False``. :type score_each_iteration: bool :param score_tree_interval: Score the model after every so many trees. Disabled if set to 0. Defaults to ``0``. :type score_tree_interval: int :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Defaults to ``"auto"``. :type fold_assignment: Literal["auto", "random", "modulo", "stratified"] :param fold_column: Column with cross-validation fold index assignment per observation. Defaults to ``None``. :type fold_column: str, optional :param response_column: Response variable column. Defaults to ``None``. :type response_column: str, optional :param ignored_columns: Names of columns to ignore for training. Defaults to ``None``. :type ignored_columns: List[str], optional :param ignore_const_cols: Ignore constant columns. Defaults to ``True``. :type ignore_const_cols: bool :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Defaults to ``None``. :type weights_column: str, optional :param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data). Defaults to ``False``. :type balance_classes: bool :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes. Defaults to ``None``. :type class_sampling_factors: List[float], optional :param max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes. Defaults to ``5.0``. :type max_after_balance_size: float :param max_confusion_matrix_size: [Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs Defaults to ``20``. :type max_confusion_matrix_size: int :param ntrees: Number of trees. Defaults to ``50``. :type ntrees: int :param max_depth: Maximum tree depth (0 for unlimited). Defaults to ``20``. :type max_depth: int :param min_rows: Fewest allowed (weighted) observations in a leaf. Defaults to ``1.0``. :type min_rows: float :param nbins: For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point Defaults to ``20``. :type nbins: int :param nbins_top_level: For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level Defaults to ``1024``. :type nbins_top_level: int :param nbins_cats: For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting. Defaults to ``1024``. :type nbins_cats: int :param r2_stopping: r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and stopping_tolerance instead. Previous version of H2O would stop making trees when the R^2 metric equals or exceeds this Defaults to ``∞``. :type r2_stopping: float :param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Defaults to ``0``. :type stopping_rounds: int :param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anonomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. Defaults to ``"auto"``. :type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"] :param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Defaults to ``0.001``. :type stopping_tolerance: float :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to ``0.0``. :type max_runtime_secs: float :param seed: Seed for pseudo random number generator (if applicable) Defaults to ``-1``. :type seed: int :param build_tree_one_node: Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets. Defaults to ``False``. :type build_tree_one_node: bool :param mtries: Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt{p} for classification and p/3 for regression (where p is the # of predictors Defaults to ``-1``. :type mtries: int :param sample_rate: Row sample rate per tree (from 0.0 to 1.0) Defaults to ``0.632``. :type sample_rate: float :param sample_rate_per_class: A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree Defaults to ``None``. :type sample_rate_per_class: List[float], optional :param binomial_double_trees: For binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy. Defaults to ``False``. :type binomial_double_trees: bool :param checkpoint: Model checkpoint to resume training with. Defaults to ``None``. :type checkpoint: Union[None, str, H2OEstimator], optional :param col_sample_rate_change_per_level: Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0) Defaults to ``1.0``. :type col_sample_rate_change_per_level: float :param col_sample_rate_per_tree: Column sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type col_sample_rate_per_tree: float :param min_split_improvement: Minimum relative improvement in squared error reduction for a split to happen Defaults to ``1e-05``. :type min_split_improvement: float :param histogram_type: What type of histogram to use for finding optimal split points Defaults to ``"auto"``. :type histogram_type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"] :param categorical_encoding: Encoding scheme for categorical features Defaults to ``"auto"``. :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] :param calibrate_model: Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Defaults to ``False``. :type calibrate_model: bool :param calibration_frame: Calibration frame for Platt Scaling Defaults to ``None``. :type calibration_frame: Union[None, str, H2OFrame], optional :param distribution: Distribution function Defaults to ``"auto"``. :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"] :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName` Defaults to ``None``. :type custom_metric_func: str, optional :param export_checkpoints_dir: Automatically export generated models to this directory. Defaults to ``None``. :type export_checkpoints_dir: str, optional :param check_constant_response: Check if response column is constant. If enabled, then an exception is thrown if the response column is a constant value.If disabled, then model will train regardless of the response column being a constant value or not. Defaults to ``True``. :type check_constant_response: bool :param gainslift_bins: Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning. Defaults to ``-1``. :type gainslift_bins: int :param auc_type: Set default multinomial AUC type. Defaults to ``"auto"``. :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"] """ super(H2ORandomForestEstimator, self).__init__() self._parms = {} self._id = self._parms['model_id'] = model_id self.training_frame = training_frame self.validation_frame = validation_frame self.nfolds = nfolds self.keep_cross_validation_models = keep_cross_validation_models self.keep_cross_validation_predictions = keep_cross_validation_predictions self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment self.score_each_iteration = score_each_iteration self.score_tree_interval = score_tree_interval self.fold_assignment = fold_assignment self.fold_column = fold_column self.response_column = response_column self.ignored_columns = ignored_columns self.ignore_const_cols = ignore_const_cols self.weights_column = weights_column self.balance_classes = balance_classes self.class_sampling_factors = class_sampling_factors self.max_after_balance_size = max_after_balance_size self.max_confusion_matrix_size = max_confusion_matrix_size self.ntrees = ntrees self.max_depth = max_depth self.min_rows = min_rows self.nbins = nbins self.nbins_top_level = nbins_top_level self.nbins_cats = nbins_cats self.r2_stopping = r2_stopping self.stopping_rounds = stopping_rounds self.stopping_metric = stopping_metric self.stopping_tolerance = stopping_tolerance self.max_runtime_secs = max_runtime_secs self.seed = seed self.build_tree_one_node = build_tree_one_node self.mtries = mtries self.sample_rate = sample_rate self.sample_rate_per_class = sample_rate_per_class self.binomial_double_trees = binomial_double_trees self.checkpoint = checkpoint self.col_sample_rate_change_per_level = col_sample_rate_change_per_level self.col_sample_rate_per_tree = col_sample_rate_per_tree self.min_split_improvement = min_split_improvement self.histogram_type = histogram_type self.categorical_encoding = categorical_encoding self.calibrate_model = calibrate_model self.calibration_frame = calibration_frame self.distribution = distribution self.custom_metric_func = custom_metric_func self.export_checkpoints_dir = export_checkpoints_dir self.check_constant_response = check_constant_response self.gainslift_bins = gainslift_bins self.auc_type = auc_type @property def training_frame(self): """ Id of the training data frame. Type: ``Union[None, str, H2OFrame]``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_drf = H2ORandomForestEstimator(seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.auc(valid=True) """ return self._parms.get("training_frame") @training_frame.setter def training_frame(self, training_frame): self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') @property def validation_frame(self): """ Id of the validation data frame. Type: ``Union[None, str, H2OFrame]``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_drf = H2ORandomForestEstimator(seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.auc(valid=True) """ return self._parms.get("validation_frame") @validation_frame.setter def validation_frame(self, validation_frame): self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame') @property def nfolds(self): """ Number of folds for K-fold cross-validation (0 to disable or >= 2). Type: ``int``, defaults to ``0``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> folds = 5 >>> cars_drf = H2ORandomForestEstimator(nfolds=folds, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=cars) >>> cars_drf.auc(xval=True) """ return self._parms.get("nfolds") @nfolds.setter def nfolds(self, nfolds): assert_is_type(nfolds, None, int) self._parms["nfolds"] = nfolds @property def keep_cross_validation_models(self): """ Whether to keep the cross-validation models. Type: ``bool``, defaults to ``True``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_models=True, ... nfolds=5, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train) >>> cars_drf.auc() """ return self._parms.get("keep_cross_validation_models") @keep_cross_validation_models.setter def keep_cross_validation_models(self, keep_cross_validation_models): assert_is_type(keep_cross_validation_models, None, bool) self._parms["keep_cross_validation_models"] = keep_cross_validation_models @property def keep_cross_validation_predictions(self): """ Whether to keep the predictions of the cross-validation models. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_predictions=True, ... nfolds=5, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train) >>> cars_drf.cross_validation_predictions() """ return self._parms.get("keep_cross_validation_predictions") @keep_cross_validation_predictions.setter def keep_cross_validation_predictions(self, keep_cross_validation_predictions): assert_is_type(keep_cross_validation_predictions, None, bool) self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions @property def keep_cross_validation_fold_assignment(self): """ Whether to keep the cross-validation fold assignment. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(keep_cross_validation_fold_assignment=True, ... nfolds=5, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train) >>> cars_drf.cross_validation_fold_assignment() """ return self._parms.get("keep_cross_validation_fold_assignment") @keep_cross_validation_fold_assignment.setter def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment): assert_is_type(keep_cross_validation_fold_assignment, None, bool) self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment @property def score_each_iteration(self): """ Whether to score during each iteration of model training. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(score_each_iteration=True, ... ntrees=55, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame = valid) >>> cars_drf.scoring_history() """ return self._parms.get("score_each_iteration") @score_each_iteration.setter def score_each_iteration(self, score_each_iteration): assert_is_type(score_each_iteration, None, bool) self._parms["score_each_iteration"] = score_each_iteration @property def score_tree_interval(self): """ Score the model after every so many trees. Disabled if set to 0. Type: ``int``, defaults to ``0``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(score_tree_interval=5, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.scoring_history() """ return self._parms.get("score_tree_interval") @score_tree_interval.setter def score_tree_interval(self, score_tree_interval): assert_is_type(score_tree_interval, None, int) self._parms["score_tree_interval"] = score_tree_interval @property def fold_assignment(self): """ Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> assignment_type = "Random" >>> cars_drf = H2ORandomForestEstimator(fold_assignment=assignment_type, ... nfolds=5, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=cars) >>> cars_drf.auc(xval=True) """ return self._parms.get("fold_assignment") @fold_assignment.setter def fold_assignment(self, fold_assignment): assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified")) self._parms["fold_assignment"] = fold_assignment @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234) >>> fold_numbers.set_names(["fold_numbers"]) >>> cars = cars.cbind(fold_numbers) >>> print(cars['fold_numbers']) >>> cars_drf = H2ORandomForestEstimator(seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=cars, ... fold_column="fold_numbers") >>> cars_drf.auc(xval=True) """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def ignore_const_cols(self): """ Ignore constant columns. Type: ``bool``, defaults to ``True``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> cars["const_1"] = 6 >>> cars["const_2"] = 7 >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(seed=1234, ... ignore_const_cols=True) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.auc(valid=True) """ return self._parms.get("ignore_const_cols") @ignore_const_cols.setter def ignore_const_cols(self, ignore_const_cols): assert_is_type(ignore_const_cols, None, bool) self._parms["ignore_const_cols"] = ignore_const_cols @property def weights_column(self): """ Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Type: ``str``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_drf = H2ORandomForestEstimator(seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid, ... weights_column="weight") >>> cars_drf.auc(valid=True) """ return self._parms.get("weights_column") @weights_column.setter def weights_column(self, weights_column): assert_is_type(weights_column, None, str) self._parms["weights_column"] = weights_column @property def balance_classes(self): """ Balance training data class counts via over/under-sampling (for imbalanced data). Type: ``bool``, defaults to ``False``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234) >>> cov_drf = H2ORandomForestEstimator(balance_classes=True, ... seed=1234) >>> cov_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('logloss', cov_drf.logloss(valid=True)) """ return self._parms.get("balance_classes") @balance_classes.setter def balance_classes(self, balance_classes): assert_is_type(balance_classes, None, bool) self._parms["balance_classes"] = balance_classes @property def class_sampling_factors(self): """ Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes. Type: ``List[float]``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234) >>> print(covtype[54].table()) >>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.] >>> cov_drf = H2ORandomForestEstimator(balance_classes=True, ... class_sampling_factors=sample_factors, ... seed=1234) >>> cov_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('logloss', cov_drf.logloss(valid=True)) """ return self._parms.get("class_sampling_factors") @class_sampling_factors.setter def class_sampling_factors(self, class_sampling_factors): assert_is_type(class_sampling_factors, None, [float]) self._parms["class_sampling_factors"] = class_sampling_factors @property def max_after_balance_size(self): """ Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes. Type: ``float``, defaults to ``5.0``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234) >>> print(covtype[54].table()) >>> max = .85 >>> cov_drf = H2ORandomForestEstimator(balance_classes=True, ... max_after_balance_size=max, ... seed=1234) >>> cov_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('logloss', cov_drf.logloss(valid=True)) """ return self._parms.get("max_after_balance_size") @max_after_balance_size.setter def max_after_balance_size(self, max_after_balance_size): assert_is_type(max_after_balance_size, None, float) self._parms["max_after_balance_size"] = max_after_balance_size @property def max_confusion_matrix_size(self): """ [Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs Type: ``int``, defaults to ``20``. """ return self._parms.get("max_confusion_matrix_size") @max_confusion_matrix_size.setter def max_confusion_matrix_size(self, max_confusion_matrix_size): assert_is_type(max_confusion_matrix_size, None, int) self._parms["max_confusion_matrix_size"] = max_confusion_matrix_size @property def ntrees(self): """ Number of trees. Type: ``int``, defaults to ``50``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> tree_num = [20, 50, 80, 110, ... 140, 170, 200] >>> label = ["20", "50", "80", "110", ... "140", "170", "200"] >>> for key, num in enumerate(tree_num): # Input an integer for 'num' and 'key' >>> titanic_drf = H2ORandomForestEstimator(ntrees=num, ... seed=1234) >>> titanic_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(label[key], 'training score', ... titanic_drf.auc(train=True)) >>> print(label[key], 'validation score', ... titanic_drf.auc(valid=True)) """ return self._parms.get("ntrees") @ntrees.setter def ntrees(self, ntrees): assert_is_type(ntrees, None, int) self._parms["ntrees"] = ntrees @property def max_depth(self): """ Maximum tree depth (0 for unlimited). Type: ``int``, defaults to ``20``. :examples: >>> df = h2o.import_file(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> response = "survived" >>> df[response] = df[response].asfactor() >>> predictors = df.columns >>> del predictors[1:3] >>> train, valid, test = df.split_frame(ratios=[0.6,0.2], ... seed=1234, ... destination_frames= ... ['train.hex','valid.hex','test.hex']) >>> drf = H2ORandomForestEstimator() >>> drf.train(x=predictors, ... y=response, ... training_frame=train) >>> perf = drf.model_performance(valid) >>> print perf.auc() """ return self._parms.get("max_depth") @max_depth.setter def max_depth(self, max_depth): assert_is_type(max_depth, None, int) self._parms["max_depth"] = max_depth @property def min_rows(self): """ Fewest allowed (weighted) observations in a leaf. Type: ``float``, defaults to ``1.0``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(min_rows=16, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cars_drf.auc(valid=True)) """ return self._parms.get("min_rows") @min_rows.setter def min_rows(self, min_rows): assert_is_type(min_rows, None, numeric) self._parms["min_rows"] = min_rows @property def nbins(self): """ For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point Type: ``int``, defaults to ``20``. :examples: >>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv") >>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor() >>> predictors = eeg.columns[:-1] >>> response = 'eyeDetection' >>> train, valid = eeg.split_frame(ratios=[.8], seed=1234) >>> bin_num = [16, 32, 64, 128, 256, 512] >>> label = ["16", "32", "64", "128", "256", "512"] >>> for key, num in enumerate(bin_num): # Insert integer for 'num' and 'key' >>> eeg_drf = H2ORandomForestEstimator(nbins=num, seed=1234) >>> eeg_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(label[key], 'training score', ... eeg_drf.auc(train=True)) >>> print(label[key], 'validation score', ... eeg_drf.auc(train=True)) """ return self._parms.get("nbins") @nbins.setter def nbins(self, nbins): assert_is_type(nbins, None, int) self._parms["nbins"] = nbins @property def nbins_top_level(self): """ For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level Type: ``int``, defaults to ``1024``. :examples: >>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv") >>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor() >>> predictors = eeg.columns[:-1] >>> response = 'eyeDetection' >>> train, valid = eeg.split_frame(ratios=[.8], ... seed=1234) >>> bin_num = [32, 64, 128, 256, 512, ... 1024, 2048, 4096] >>> label = ["32", "64", "128", "256", ... "512", "1024", "2048", "4096"] >>> for key, num in enumerate(bin_num): # Insert integer for 'num' and 'key' >>> eeg_drf = H2ORandomForestEstimator(nbins_top_level=32, ... seed=1234) >>> eeg_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(label[key], 'training score', ... eeg_gbm.auc(train=True)) >>> print(label[key], 'validation score', ... eeg_gbm.auc(valid=True)) """ return self._parms.get("nbins_top_level") @nbins_top_level.setter def nbins_top_level(self, nbins_top_level): assert_is_type(nbins_top_level, None, int) self._parms["nbins_top_level"] = nbins_top_level @property def nbins_cats(self): """ For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting. Type: ``int``, defaults to ``1024``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> bin_num = [8, 16, 32, 64, 128, 256, ... 512, 1024, 2048, 4096] >>> label = ["8", "16", "32", "64", "128", ... "256", "512", "1024", "2048", "4096"] >>> for key, num in enumerate(bin_num): # Insert integer for 'num' and 'key' >>> airlines_drf = H2ORandomForestEstimator(nbins_cats=num, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(label[key], 'training score', ... airlines_gbm.auc(train=True)) >>> print(label[key], 'validation score', ... airlines_gbm.auc(valid=True)) """ return self._parms.get("nbins_cats") @nbins_cats.setter def nbins_cats(self, nbins_cats): assert_is_type(nbins_cats, None, int) self._parms["nbins_cats"] = nbins_cats @property def r2_stopping(self): """ r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and stopping_tolerance instead. Previous version of H2O would stop making trees when the R^2 metric equals or exceeds this Type: ``float``, defaults to ``∞``. """ return self._parms.get("r2_stopping") @r2_stopping.setter def r2_stopping(self, r2_stopping): assert_is_type(r2_stopping, None, numeric) self._parms["r2_stopping"] = r2_stopping @property def stopping_rounds(self): """ Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Type: ``int``, defaults to ``0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_drf.auc(valid=True) """ return self._parms.get("stopping_rounds") @stopping_rounds.setter def stopping_rounds(self, stopping_rounds): assert_is_type(stopping_rounds, None, int) self._parms["stopping_rounds"] = stopping_rounds @property def stopping_metric(self): """ Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anonomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_drf.auc(valid=True) """ return self._parms.get("stopping_metric") @stopping_metric.setter def stopping_metric(self, stopping_metric): assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing")) self._parms["stopping_metric"] = stopping_metric @property def stopping_tolerance(self): """ Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Type: ``float``, defaults to ``0.001``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_drf = H2ORandomForestEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_drf.auc(valid=True) """ return self._parms.get("stopping_tolerance") @stopping_tolerance.setter def stopping_tolerance(self, stopping_tolerance): assert_is_type(stopping_tolerance, None, numeric) self._parms["stopping_tolerance"] = stopping_tolerance @property def max_runtime_secs(self): """ Maximum allowed runtime in seconds for model training. Use 0 to disable. Type: ``float``, defaults to ``0.0``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(max_runtime_secs=10, ... ntrees=10000, ... max_depth=10, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.auc(valid = True) """ return self._parms.get("max_runtime_secs") @max_runtime_secs.setter def max_runtime_secs(self, max_runtime_secs): assert_is_type(max_runtime_secs, None, numeric) self._parms["max_runtime_secs"] = max_runtime_secs @property def seed(self): """ Seed for pseudo random number generator (if applicable) Type: ``int``, defaults to ``-1``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> drf_w_seed_1 = H2ORandomForestEstimator(seed=1234) >>> drf_w_seed_1.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('auc for the 1st model build with a seed:', ... drf_w_seed_1.auc(valid=True)) """ return self._parms.get("seed") @seed.setter def seed(self, seed): assert_is_type(seed, None, int) self._parms["seed"] = seed @property def build_tree_one_node(self): """ Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(build_tree_one_node=True, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.auc(valid=True) """ return self._parms.get("build_tree_one_node") @build_tree_one_node.setter def build_tree_one_node(self, build_tree_one_node): assert_is_type(build_tree_one_node, None, bool) self._parms["build_tree_one_node"] = build_tree_one_node @property def mtries(self): """ Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt{p} for classification and p/3 for regression (where p is the # of predictors Type: ``int``, defaults to ``-1``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234) >>> cov_drf = H2ORandomForestEstimator(mtries=30, seed=1234) >>> cov_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('logloss', cov_drf.logloss(valid=True)) """ return self._parms.get("mtries") @mtries.setter def mtries(self, mtries): assert_is_type(mtries, None, int) self._parms["mtries"] = mtries @property def sample_rate(self): """ Row sample rate per tree (from 0.0 to 1.0) Type: ``float``, defaults to ``0.632``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_drf = H2ORandomForestEstimator(sample_rate=.7, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_drf.auc(valid=True)) """ return self._parms.get("sample_rate") @sample_rate.setter def sample_rate(self, sample_rate): assert_is_type(sample_rate, None, numeric) self._parms["sample_rate"] = sample_rate @property def sample_rate_per_class(self): """ A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree Type: ``List[float]``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> print(train[response].table()) >>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1] >>> cov_drf = H2ORandomForestEstimator(sample_rate_per_class=rate_per_class_list, ... seed=1234) >>> cov_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('logloss', cov_drf.logloss(valid=True)) """ return self._parms.get("sample_rate_per_class") @sample_rate_per_class.setter def sample_rate_per_class(self, sample_rate_per_class): assert_is_type(sample_rate_per_class, None, [numeric]) self._parms["sample_rate_per_class"] = sample_rate_per_class @property def binomial_double_trees(self): """ For binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(binomial_double_trees=False, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('without binomial_double_trees:', ... cars_drf.auc(valid=True)) >>> cars_drf_2 = H2ORandomForestEstimator(binomial_double_trees=True, ... seed=1234) >>> cars_drf_2.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('with binomial_double_trees:', cars_drf_2.auc(valid=True)) """ return self._parms.get("binomial_double_trees") @binomial_double_trees.setter def binomial_double_trees(self, binomial_double_trees): assert_is_type(binomial_double_trees, None, bool) self._parms["binomial_double_trees"] = binomial_double_trees @property def checkpoint(self): """ Model checkpoint to resume training with. Type: ``Union[None, str, H2OEstimator]``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_drf = H2ORandomForestEstimator(ntrees=1, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cars_drf.auc(valid=True)) """ return self._parms.get("checkpoint") @checkpoint.setter def checkpoint(self, checkpoint): assert_is_type(checkpoint, None, str, H2OEstimator) self._parms["checkpoint"] = checkpoint @property def col_sample_rate_change_per_level(self): """ Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_drf = H2ORandomForestEstimator(col_sample_rate_change_per_level=.9, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_drf.auc(valid=True)) """ return self._parms.get("col_sample_rate_change_per_level") @col_sample_rate_change_per_level.setter def col_sample_rate_change_per_level(self, col_sample_rate_change_per_level): assert_is_type(col_sample_rate_change_per_level, None, numeric) self._parms["col_sample_rate_change_per_level"] = col_sample_rate_change_per_level @property def col_sample_rate_per_tree(self): """ Column sample rate per tree (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_drf = H2ORandomForestEstimator(col_sample_rate_per_tree=.7, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_drf.auc(valid=True)) """ return self._parms.get("col_sample_rate_per_tree") @col_sample_rate_per_tree.setter def col_sample_rate_per_tree(self, col_sample_rate_per_tree): assert_is_type(col_sample_rate_per_tree, None, numeric) self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree @property def min_split_improvement(self): """ Minimum relative improvement in squared error reduction for a split to happen Type: ``float``, defaults to ``1e-05``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(min_split_improvement=1e-3, ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cars_drf.auc(valid=True)) """ return self._parms.get("min_split_improvement") @min_split_improvement.setter def min_split_improvement(self, min_split_improvement): assert_is_type(min_split_improvement, None, numeric) self._parms["min_split_improvement"] = min_split_improvement @property def histogram_type(self): """ What type of histogram to use for finding optimal split points Type: ``Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_drf = H2ORandomForestEstimator(histogram_type="UniformAdaptive", ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_drf.auc(valid=True)) """ return self._parms.get("histogram_type") @histogram_type.setter def histogram_type(self, histogram_type): assert_is_type(histogram_type, None, Enum("auto", "uniform_adaptive", "random", "quantiles_global", "round_robin")) self._parms["histogram_type"] = histogram_type @property def categorical_encoding(self): """ Encoding scheme for categorical features Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> encoding = "one_hot_explicit" >>> airlines_drf = H2ORandomForestEstimator(categorical_encoding=encoding, ... seed=1234) >>> airlines_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_drf.auc(valid=True) """ return self._parms.get("categorical_encoding") @categorical_encoding.setter def categorical_encoding(self, categorical_encoding): assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) self._parms["categorical_encoding"] = categorical_encoding @property def calibrate_model(self): """ Use Platt Scaling to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Type: ``bool``, defaults to ``False``. :examples: >>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv") >>> ecology['Angaus'] = ecology['Angaus'].asfactor() >>> from h2o.estimators.random_forest import H2ORandomForestEstimator >>> response = 'Angaus' >>> predictors = ecology.columns[3:13] >>> train, calib = ecology.split_frame(seed=12354) >>> w = h2o.create_frame(binary_fraction=1, ... binary_ones_fraction=0.5, ... missing_fraction=0, ... rows=744, cols=1) >>> w.set_names(["weight"]) >>> train = train.cbind(w) >>> ecology_drf = H2ORandomForestEstimator(ntrees=10, ... max_depth=5, ... min_rows=10, ... distribution="multinomial", ... weights_column="weight", ... calibrate_model=True, ... calibration_frame=calib) >>> ecology_drf.train(x=predictors, ... y="Angaus", ... training_frame=train) >>> predicted = ecology_drf.predict(calib) """ return self._parms.get("calibrate_model") @calibrate_model.setter def calibrate_model(self, calibrate_model): assert_is_type(calibrate_model, None, bool) self._parms["calibrate_model"] = calibrate_model @property def calibration_frame(self): """ Calibration frame for Platt Scaling Type: ``Union[None, str, H2OFrame]``. :examples: >>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv") >>> ecology['Angaus'] = ecology['Angaus'].asfactor() >>> response = 'Angaus' >>> predictors = ecology.columns[3:13] >>> train, calib = ecology.split_frame(seed = 12354) >>> w = h2o.create_frame(binary_fraction=1, ... binary_ones_fraction=0.5, ... missing_fraction=0, ... rows=744, cols=1) >>> w.set_names(["weight"]) >>> train = train.cbind(w) >>> ecology_drf = H2ORandomForestEstimator(ntrees=10, ... max_depth=5, ... min_rows=10, ... distribution="multinomial", ... calibrate_model=True, ... calibration_frame=calib) >>> ecology_drf.train(x=predictors, ... y="Angaus, ... training_frame=train, ... weights_column="weight") >>> predicted = ecology_drf.predict(train) """ return self._parms.get("calibration_frame") @calibration_frame.setter def calibration_frame(self, calibration_frame): self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame') @property def distribution(self): """ Distribution function Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]``, defaults to ``"auto"``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "cylinders" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_drf = H2ORandomForestEstimator(distribution="poisson", ... seed=1234) >>> cars_drf.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_drf.mse(valid=True) """ return self._parms.get("distribution") @distribution.setter def distribution(self, distribution): assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber")) self._parms["distribution"] = distribution @property def custom_metric_func(self): """ Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. """ return self._parms.get("custom_metric_func") @custom_metric_func.setter def custom_metric_func(self, custom_metric_func): assert_is_type(custom_metric_func, None, str) self._parms["custom_metric_func"] = custom_metric_func @property def export_checkpoints_dir(self): """ Automatically export generated models to this directory. Type: ``str``. :examples: >>> import tempfile >>> from os import listdir >>> from h2o.grid.grid_search import H2OGridSearch >>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex") >>> predictors = ["DayofMonth", "DayOfWeek"] >>> response = "IsDepDelayed" >>> hyper_parameters = {'ntrees': [5,10]} >>> search_crit = {'strategy': "RandomDiscrete", ... 'max_models': 5, ... 'seed': 1234, ... 'stopping_rounds': 3, ... 'stopping_metric': "AUTO", ... 'stopping_tolerance': 1e-2} >>> checkpoints_dir = tempfile.mkdtemp() >>> air_grid = H2OGridSearch(H2ORandomForestEstimator, ... hyper_params=hyper_parameters, ... search_criteria=search_crit) >>> air_grid.train(x=predictors, ... y=response, ... training_frame=airlines, ... distribution="bernoulli", ... max_depth=3, ... export_checkpoints_dir=checkpoints_dir) >>> num_files = len(listdir(checkpoints_dir)) >>> num_files """ return self._parms.get("export_checkpoints_dir") @export_checkpoints_dir.setter def export_checkpoints_dir(self, export_checkpoints_dir): assert_is_type(export_checkpoints_dir, None, str) self._parms["export_checkpoints_dir"] = export_checkpoints_dir @property def check_constant_response(self): """ Check if response column is constant. If enabled, then an exception is thrown if the response column is a constant value.If disabled, then model will train regardless of the response column being a constant value or not. Type: ``bool``, defaults to ``True``. :examples: >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv") >>> train["constantCol"] = 1 >>> my_drf = H2ORandomForestEstimator(check_constant_response=False) >>> my_drf.train(x=list(range(1,5)), ... y="constantCol", ... training_frame=train) """ return self._parms.get("check_constant_response") @check_constant_response.setter def check_constant_response(self, check_constant_response): assert_is_type(check_constant_response, None, bool) self._parms["check_constant_response"] = check_constant_response @property def gainslift_bins(self): """ Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning. Type: ``int``, defaults to ``-1``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv") >>> model = H2ORandomForestEstimator(ntrees=1, gainslift_bins=20) >>> model.train(x=["Origin", "Distance"], ... y="IsDepDelayed", ... training_frame=airlines) >>> model.gains_lift() """ return self._parms.get("gainslift_bins") @gainslift_bins.setter def gainslift_bins(self, gainslift_bins): assert_is_type(gainslift_bins, None, int) self._parms["gainslift_bins"] = gainslift_bins @property def auc_type(self): """ Set default multinomial AUC type. Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to ``"auto"``. """ return self._parms.get("auc_type") @auc_type.setter def auc_type(self, auc_type): assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo")) self._parms["auc_type"] = auc_type offset_column = deprecated_property('offset_column', None)