# -*- encoding: utf-8 -*-
"""
H2O TargetEncoder.
:copyright: (c) 2016 H2O.ai
:license: Apache License Version 2.0 (see LICENSE for details)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.expr import ExprNode
from h2o.frame import H2OFrame
from h2o.utils.typechecks import (assert_is_type)
__all__ = ("TargetEncoder", )
[docs]class TargetEncoder(object):
"""
Status: alpha version
This is the main class that provides the Python API to the Java implementation of target encoding.
In general, target encoding can be applied to three types of problems, namely:
1) Binary classification (supported)
2) Multi-class classification (not supported yet)
3) Regression (not supported yet)
Sample usage:
>>> targetEncoder = TargetEncoder(x=e_columns, y=responseColumnName, blending_avg=True, inflection_point=3, smoothing=1)
>>> targetEncoder.fit(frame)
>>> encodedValid = targetEncoder.transform(frame=frame, holdout_type="kfold", seed=1234, is_train_or_valid=True)
>>> encodedTest = targetEncoder.transform(frame=testFrame, holdout_type="none", noise=0.0, seed=1234, is_train_or_valid=False)
"""
#-------------------------------------------------------------------------------------------------------------------
# Construction
#-------------------------------------------------------------------------------------------------------------------
def __init__(self, x=None, y=None, fold_column='', blending_avg=True, inflection_point=3, smoothing=1):
"""
Creates instance of the TargetEncoder class and setting parameters that will be used in both `train` and `transform` methods.
:param List[str] x: List of categorical column names that we want apply target encoding to
:param str y: response column we will create encodings with
:param str fold_column: fold column if we want to use 'kfold' holdout_type
:param boolean blending_avg: whether to use blending or not
:param double inflection_point: parameter for blending. Used to calculate `lambda`. Parameter determines half of the minimal sample size
for which we completely trust the estimate based on the sample in the particular level of categorical variable.
:param double smoothing: parameter for blending. Used to calculate `lambda`. The parameter f controls the rate of transition between
the particular level's posterior probability and the prior probability. For smoothing values approaching infinity it becomes a hard
threshold between the posterior and the prior probability.
"""
self._teColumns = x
self._responseColumnName = y
self._foldColumnName = fold_column
self._blending = blending_avg
self._inflectionPoint = inflection_point
self._smoothing = smoothing
def fit(self, frame=None):
    """
    Generate the encoding map for the configured columns from `frame` and remember it on this instance.

    :param frame frame: frame the encoding map for target encoding is generated from.
    :returns: the encoding map, an object that maps 'column_name' -> 'frame_with_encoding_map_for_this_column_name'.
    """
    fit_expr = ExprNode("target.encoder.fit", frame, self._teColumns, self._responseColumnName,
                        self._foldColumnName)
    # Keep the result on the instance so it stays available after `fit` returns.
    encoding_map = fit_expr._eager_map_frame()
    self._encodingMap = encoding_map
    return encoding_map