Source code for h2o.estimators.targetencoder

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
import h2o


[docs]class H2OTargetEncoderEstimator(H2OEstimator): """ TargetEncoder """ algo = "targetencoder" param_names = {"blending", "k", "f", "data_leakage_handling", "noise_level", "seed", "model_id", "ignored_columns", "training_frame", "fold_column", "response_column"} def __init__(self, **kwargs): super(H2OTargetEncoderEstimator, self).__init__() self._parms = {} for pname, pvalue in kwargs.items(): if pname == 'model_id': self._id = pvalue self._parms["model_id"] = pvalue elif pname in self.param_names: # Using setattr(...) will invoke type-checking of the arguments setattr(self, pname, pvalue) else: raise H2OValueError("Unknown parameter %s = %r" % (pname, pvalue)) @property def blending(self): """ Blending enabled/disabled Type: ``bool`` (default: ``False``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic["survived"] = titanic["survived"].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_te """ return self._parms.get("blending") @blending.setter def blending(self, blending): assert_is_type(blending, None, bool) self._parms["blending"] = blending @property def k(self): """ Inflection point. Used for blending (if enabled). Blending is to be enabled separately using the 'blending' parameter. Type: ``float`` (default: ``10``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic["survived"] = titanic["survived"].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_te """ return self._parms.get("k") @k.setter def k(self, k): assert_is_type(k, None, numeric) self._parms["k"] = k @property def f(self): """ Smoothing. Used for blending (if enabled). Blending is to be enabled separately using the 'blending' parameter. Type: ``float`` (default: ``20``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic["survived"] = titanic["survived"].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_te """ return self._parms.get("f") @f.setter def f(self, f): assert_is_type(f, None, numeric) self._parms["f"] = f @property def data_leakage_handling(self): """ Data leakage handling strategy. One of: ``"none"``, ``"k_fold"``, ``"leave_one_out"`` (default: ``"none"``). :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic["survived"] = titanic["survived"].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... data_leakage_handling="k_fold", ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_te """ return self._parms.get("data_leakage_handling") @data_leakage_handling.setter def data_leakage_handling(self, data_leakage_handling): assert_is_type(data_leakage_handling, None, Enum("none", "k_fold", "leave_one_out")) self._parms["data_leakage_handling"] = data_leakage_handling @property def noise_level(self): """ Noise level Type: ``float`` (default: ``0.01``). """ return self._parms.get("noise_level") @noise_level.setter def noise_level(self, noise_level): assert_is_type(noise_level, None, numeric) self._parms["noise_level"] = noise_level @property def seed(self): """ Seed for the specified noise level Type: ``int`` (default: ``-1``). """ return self._parms.get("seed") @seed.setter def seed(self, seed): assert_is_type(seed, None, int) self._parms["seed"] = seed @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def training_frame(self): """ Id of the training data frame. Type: ``H2OFrame``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic["survived"] = titanic["survived"].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_te """ return self._parms.get("training_frame") @training_frame.setter def training_frame(self, training_frame): self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic["survived"] = titanic["survived"].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_te """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column
[docs] def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1): """ Apply transformation to `te_columns` based on the encoding maps generated during `train()` method call. :param H2OFrame frame: to which frame we are applying target encoding transformations. :param str data_leakage_handling: Supported options: 1) "k_fold" - encodings for a fold are generated based on out-of-fold data. 2) "leave_one_out" - leave one out. Current row's response value is subtracted from the pre-calculated per-level frequencies. 3) "none" - we do not holdout anything. Using whole frame for training :param float noise: the amount of random noise added to the target encoding. This helps prevent overfitting. Defaults to 0.01 * range of y. :param int seed: a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1. :example: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> predictors = ["home.dest", "cabin", "embarked"] >>> response = "survived" >>> titanic[response] = titanic[response].asfactor() >>> fold_col = "kfold_column" >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234) >>> titanic_te = H2OTargetEncoderEstimator(k=35, ... f=25, ... data_leakage_handling="leave_one_out", ... blending=True) >>> titanic_te.train(x=predictors, ... y=response, ... training_frame=titanic) >>> transformed = titanic_te.transform(frame=titanic, ... data_leakage_handling="leave_one_out", ... seed=1234) """ output = h2o.api("GET /3/TargetEncoderTransform", data={'model': self.model_id, 'frame': frame.key, 'data_leakage_handling': data_leakage_handling, 'noise': noise, 'seed': seed}) return h2o.get_frame(output["name"])