#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.utils.metaclass import deprecated_params, deprecated_property
import h2o
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
class H2ORuleFitEstimator(H2OEstimator):
"""
RuleFit
    Builds a RuleFit model on a parsed dataset, for regression or
    classification.
"""
algo = "rulefit"
supervised_learning = True
@deprecated_params({'Lambda': 'lambda_'})
def __init__(self,
model_id=None, # type: Optional[Union[None, str, H2OEstimator]]
training_frame=None, # type: Optional[Union[None, str, H2OFrame]]
validation_frame=None, # type: Optional[Union[None, str, H2OFrame]]
seed=-1, # type: int
response_column=None, # type: Optional[str]
ignored_columns=None, # type: Optional[List[str]]
algorithm="auto", # type: Literal["auto", "drf", "gbm"]
min_rule_length=3, # type: int
max_rule_length=3, # type: int
max_num_rules=-1, # type: int
model_type="rules_and_linear", # type: Literal["rules_and_linear", "rules", "linear"]
weights_column=None, # type: Optional[str]
distribution="auto", # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
rule_generation_ntrees=50, # type: int
auc_type="auto", # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
remove_duplicates=True, # type: bool
lambda_=None, # type: Optional[List[float]]
max_categorical_levels=10, # type: int
):
"""
:param model_id: Destination id for this model; auto-generated if not specified.
Defaults to ``None``.
:type model_id: Union[None, str, H2OEstimator], optional
:param training_frame: Id of the training data frame.
Defaults to ``None``.
:type training_frame: Union[None, str, H2OFrame], optional
:param validation_frame: Id of the validation data frame.
Defaults to ``None``.
:type validation_frame: Union[None, str, H2OFrame], optional
:param seed: Seed for pseudo random number generator (if applicable).
Defaults to ``-1``.
:type seed: int
:param response_column: Response variable column.
Defaults to ``None``.
:type response_column: str, optional
:param ignored_columns: Names of columns to ignore for training.
Defaults to ``None``.
:type ignored_columns: List[str], optional
:param algorithm: The algorithm to use to generate rules.
Defaults to ``"auto"``.
:type algorithm: Literal["auto", "drf", "gbm"]
        :param min_rule_length: Minimum length of rules.
               Defaults to ``3``.
:type min_rule_length: int
        :param max_rule_length: Maximum length of rules.
               Defaults to ``3``.
:type max_rule_length: int
        :param max_num_rules: The maximum number of rules to return. The default of -1 means the number of rules is
               selected by diminishing returns in model deviance.
Defaults to ``-1``.
:type max_num_rules: int
:param model_type: Specifies type of base learners in the ensemble.
Defaults to ``"rules_and_linear"``.
:type model_type: Literal["rules_and_linear", "rules", "linear"]
:param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
not increase the size of the data frame. This is typically the number of times a row is repeated, but
non-integer values are supported as well. During training, rows with higher weights matter more, due to
the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0.
Defaults to ``None``.
:type weights_column: str, optional
        :param distribution: Distribution function.
Defaults to ``"auto"``.
:type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
"quantile", "huber"]
        :param rule_generation_ntrees: Specifies the number of trees to build in the tree model.
               Defaults to ``50``.
:type rule_generation_ntrees: int
:param auc_type: Set default multinomial AUC type.
Defaults to ``"auto"``.
:type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
        :param remove_duplicates: Whether to remove rules that are identical to an earlier rule.
               Defaults to ``True``.
:type remove_duplicates: bool
:param lambda_: Lambda for LASSO regressor.
Defaults to ``None``.
:type lambda_: List[float], optional
:param max_categorical_levels: For every categorical feature, only use this many most frequent categorical
levels for model training. Only used for categorical_encoding == EnumLimited.
Defaults to ``10``.
:type max_categorical_levels: int
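
        A minimal usage sketch; the dataset URL and the response column name below are
        placeholders for illustration only, not values from the generated documentation:

        :examples:

        >>> import h2o
        >>> from h2o.estimators import H2ORuleFitEstimator
        >>> h2o.init()
        >>> df = h2o.import_file("https://example.com/train.csv")  # placeholder URL
        >>> rfit = H2ORuleFitEstimator(min_rule_length=1,
        ...                            max_rule_length=5,
        ...                            max_num_rules=100,
        ...                            seed=1)
        >>> rfit.train(y="response", training_frame=df)  # "response" is a placeholder column name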
"""
super(H2ORuleFitEstimator, self).__init__()
self._parms = {}
self._id = self._parms['model_id'] = model_id
self.training_frame = training_frame
self.validation_frame = validation_frame
self.seed = seed
self.response_column = response_column
self.ignored_columns = ignored_columns
self.algorithm = algorithm
self.min_rule_length = min_rule_length
self.max_rule_length = max_rule_length
self.max_num_rules = max_num_rules
self.model_type = model_type
self.weights_column = weights_column
self.distribution = distribution
self.rule_generation_ntrees = rule_generation_ntrees
self.auc_type = auc_type
self.remove_duplicates = remove_duplicates
self.lambda_ = lambda_
self.max_categorical_levels = max_categorical_levels
@property
def training_frame(self):
"""
Id of the training data frame.
Type: ``Union[None, str, H2OFrame]``.
"""
return self._parms.get("training_frame")
@training_frame.setter
def training_frame(self, training_frame):
self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def validation_frame(self):
"""
Id of the validation data frame.
Type: ``Union[None, str, H2OFrame]``.
"""
return self._parms.get("validation_frame")
@validation_frame.setter
def validation_frame(self, validation_frame):
self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def seed(self):
"""
Seed for pseudo random number generator (if applicable).
Type: ``int``, defaults to ``-1``.
"""
return self._parms.get("seed")
@seed.setter
def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed
@property
def response_column(self):
"""
Response variable column.
Type: ``str``.
"""
return self._parms.get("response_column")
@response_column.setter
def response_column(self, response_column):
assert_is_type(response_column, None, str)
self._parms["response_column"] = response_column
@property
def ignored_columns(self):
"""
Names of columns to ignore for training.
Type: ``List[str]``.
"""
return self._parms.get("ignored_columns")
@ignored_columns.setter
def ignored_columns(self, ignored_columns):
assert_is_type(ignored_columns, None, [str])
self._parms["ignored_columns"] = ignored_columns
@property
def algorithm(self):
"""
The algorithm to use to generate rules.
Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``.
"""
return self._parms.get("algorithm")
@algorithm.setter
def algorithm(self, algorithm):
assert_is_type(algorithm, None, Enum("auto", "drf", "gbm"))
self._parms["algorithm"] = algorithm
@property
def min_rule_length(self):
"""
        Minimum length of rules.
        Type: ``int``, defaults to ``3``.
"""
return self._parms.get("min_rule_length")
@min_rule_length.setter
def min_rule_length(self, min_rule_length):
assert_is_type(min_rule_length, None, int)
self._parms["min_rule_length"] = min_rule_length
@property
def max_rule_length(self):
"""
        Maximum length of rules.
        Type: ``int``, defaults to ``3``.
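
        A hypothetical illustration (not from the generated docs):

        :examples:

        >>> # Generate rules with between 2 and 5 conditions each.
        >>> rfit = H2ORuleFitEstimator(min_rule_length=2, max_rule_length=5, seed=1)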
"""
return self._parms.get("max_rule_length")
@max_rule_length.setter
def max_rule_length(self, max_rule_length):
assert_is_type(max_rule_length, None, int)
self._parms["max_rule_length"] = max_rule_length
@property
def max_num_rules(self):
"""
        The maximum number of rules to return. The default of -1 means the number of rules is selected
        by diminishing returns in model deviance.
Type: ``int``, defaults to ``-1``.
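
        A hypothetical illustration (not from the generated docs):

        :examples:

        >>> # Cap the ensemble at 50 rules instead of letting deviance-based selection choose the count.
        >>> rfit = H2ORuleFitEstimator(max_num_rules=50, seed=1)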
"""
return self._parms.get("max_num_rules")
@max_num_rules.setter
def max_num_rules(self, max_num_rules):
assert_is_type(max_num_rules, None, int)
self._parms["max_num_rules"] = max_num_rules
@property
def model_type(self):
"""
Specifies type of base learners in the ensemble.
Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``.
"""
return self._parms.get("model_type")
@model_type.setter
def model_type(self, model_type):
assert_is_type(model_type, None, Enum("rules_and_linear", "rules", "linear"))
self._parms["model_type"] = model_type
@property
def weights_column(self):
"""
Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an
accurate prediction, remove all rows with weight == 0.
Type: ``str``.
"""
return self._parms.get("weights_column")
@weights_column.setter
def weights_column(self, weights_column):
assert_is_type(weights_column, None, str)
self._parms["weights_column"] = weights_column
@property
def distribution(self):
"""
        Distribution function.
Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
"quantile", "huber"]``, defaults to ``"auto"``.
"""
return self._parms.get("distribution")
@distribution.setter
def distribution(self, distribution):
assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"))
self._parms["distribution"] = distribution
@property
def rule_generation_ntrees(self):
"""
        Specifies the number of trees to build in the tree model.
        Type: ``int``, defaults to ``50``.
"""
return self._parms.get("rule_generation_ntrees")
@rule_generation_ntrees.setter
def rule_generation_ntrees(self, rule_generation_ntrees):
assert_is_type(rule_generation_ntrees, None, int)
self._parms["rule_generation_ntrees"] = rule_generation_ntrees
@property
def auc_type(self):
"""
Set default multinomial AUC type.
Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
``"auto"``.
"""
return self._parms.get("auc_type")
@auc_type.setter
def auc_type(self, auc_type):
assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
self._parms["auc_type"] = auc_type
@property
def remove_duplicates(self):
"""
        Whether to remove rules that are identical to an earlier rule.
        Type: ``bool``, defaults to ``True``.
"""
return self._parms.get("remove_duplicates")
@remove_duplicates.setter
def remove_duplicates(self, remove_duplicates):
assert_is_type(remove_duplicates, None, bool)
self._parms["remove_duplicates"] = remove_duplicates
@property
def lambda_(self):
"""
Lambda for LASSO regressor.
Type: ``List[float]``.
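
        A hypothetical illustration (not from the generated docs):

        :examples:

        >>> # Supply an explicit regularization path for the LASSO step.
        >>> rfit = H2ORuleFitEstimator(lambda_=[1e-3, 1e-4, 1e-5], seed=1)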
"""
return self._parms.get("lambda")
@lambda_.setter
def lambda_(self, lambda_):
assert_is_type(lambda_, None, numeric, [numeric])
self._parms["lambda"] = lambda_
@property
def max_categorical_levels(self):
"""
For every categorical feature, only use this many most frequent categorical levels for model training. Only used
for categorical_encoding == EnumLimited.
Type: ``int``, defaults to ``10``.
"""
return self._parms.get("max_categorical_levels")
@max_categorical_levels.setter
def max_categorical_levels(self, max_categorical_levels):
assert_is_type(max_categorical_levels, None, int)
self._parms["max_categorical_levels"] = max_categorical_levels
Lambda = deprecated_property('Lambda', lambda_)
    def rule_importance(self):
"""
        Retrieve rule importances for a RuleFit model.

        :return: H2OTwoDimTable of rule importances.
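
        A hypothetical illustration, assuming ``rfit`` is an already trained RuleFit model:

        :examples:

        >>> importances = rfit.rule_importance()
        >>> print(importances)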
"""
if self._model_json["algo"] != "rulefit":
raise H2OValueError("This function is available for Rulefit models only")
kwargs = {}
kwargs["model_id"] = self.model_id
json = h2o.api("POST /3/SignificantRules", data=kwargs)
return json['significant_rules_table']
    def predict_rules(self, frame, rule_ids):
"""
Evaluates validity of the given rules on the given data.
:param frame: H2OFrame on which rule validity is to be evaluated
:param rule_ids: string array of rule ids to be evaluated against the frame
        :return: H2OFrame with one column per input rule id, each holding a flag that indicates whether the rule
            applies to the observation.
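
        A hypothetical illustration, assuming ``rfit`` is a trained RuleFit model and ``df`` is an
        H2OFrame; the rule id string is a placeholder, since real ids are model-specific:

        :examples:

        >>> applied = rfit.predict_rules(df, ["rule_id_placeholder"])
        >>> applied.head()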
"""
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type
from h2o.expr import ExprNode
assert_is_type(frame, H2OFrame)
return H2OFrame._expr(expr=ExprNode("rulefit.predict.rules", self, frame, rule_ids))