# -*- encoding: utf-8 -*-
"""
H2O TargetEncoder.
:copyright: (c) 2016 H2O.ai
:license: Apache License Version 2.0 (see LICENSE for details)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.expr import ExprNode
from h2o.frame import H2OFrame
from h2o.utils.typechecks import (assert_is_type)
from h2o import get_frame
import warnings
__all__ = ("TargetEncoder", )
[docs]class TargetEncoder(object):
"""
Deprecated API. Please use H2OTargetEncoderEstimator instead.
This is a main class that provides Python's API to the Java implementation of the target encoding.
In general target encoding could be applied to three types of problems, namely:
1) Binary classification (supported)
2) Multi-class classification (not supported yet)
3) Regression (not supported yet)
:param List[str]-or-List[int] x: List of categorical column names or indices that we want to apply target encoding to.
:param str-or-int y: the name or column index of the response variable in the data.
:param str-or-int fold_column: the name or column index of the fold column in the data.
:param boolean blending_avg: (deprecated) whether to perform blended average. Defaults to TRUE.
:param boolean blended_avg: whether to perform blended average. Defaults to TRUE.
:param double inflection_point: parameter for blending. Used to calculate `lambda`. Determines half of the minimal sample size
for which we completely trust the estimate based on the sample in the particular level of categorical variable. Default value is 10.
:param double smoothing: parameter for blending. Used to calculate `lambda`. Controls the rate of transition between
the particular level's posterior probability and the prior probability. For smoothing values approaching infinity it becomes a hard
threshold between the posterior and the prior probability. Default value is 20.
:examples:
>>> targetEncoder = TargetEncoder(x=te_columns, y=responseColumnName, blended_avg=True, inflection_point=10, smoothing=20)
>>> targetEncoder.fit(trainFrame)
>>> encodedTrain = targetEncoder.transform(frame=trainFrame, holdout_type="kfold", seed=1234, is_train_or_valid=True)
>>> encodedValid = targetEncoder.transform(frame=validFrame, holdout_type="none", noise=0.0, is_train_or_valid=True)
>>> encodedTest = targetEncoder.transform(frame=testFrame, holdout_type="none", noise=0.0, is_train_or_valid=False)
"""
#-------------------------------------------------------------------------------------------------------------------
# Construction
#-------------------------------------------------------------------------------------------------------------------
def __init__(self, x=None, y=None, fold_column='', blended_avg=True, inflection_point=10, smoothing=20, **kwargs):
"""
Deprecated API. Please use H2OTargetencoderEstimator instead.
Creates instance of the TargetEncoder class and setting parameters that will be used in both `train` and `transform` methods.
"""
if(type(x) == str or type(x) == int):
x = [x]
self._teColumns = x
self._responseColumnName = y
self._foldColumnName = fold_column
if 'blending_avg' in kwargs:
warnings.warn("Parameter blending_avg is deprecated; use blended_avg instead", category=DeprecationWarning, stacklevel=2)
self._blending = kwargs.get('blending_avg')
else:
self._blending = blended_avg
if not inflection_point > 0:
raise ValueError("Parameter `inflection_point` should be greater than 0")
if not smoothing > 0:
raise ValueError("Parameter `smoothing` should be greater than 0")
self._inflectionPoint = inflection_point
self._smoothing = smoothing
[docs] def fit(self, frame = None):
"""
Deprecated API. Please use H2OTargetencoderEstimator instead.
Returns encoding map as an object that maps 'column_name' -> 'frame_with_encoding_map_for_this_column_name'
:param frame frame: An H2OFrame object with which to create the target encoding map
:examples:
>>> targetEncoder = TargetEncoder(x=te_columns, y=responseColumnName, blended_avg=True, inflection_point=10, smoothing=20)
>>> targetEncoder.fit(trainFrame)
"""
self._teColumns = list(map(lambda i: frame.names[i], self._teColumns)) if all(isinstance(n, int) for n in self._teColumns) else self._teColumns
self._responseColumnName = frame.names[self._responseColumnName] if isinstance(self._responseColumnName, int) else self._responseColumnName
self._foldColumnName = frame.names[self._foldColumnName] if isinstance(self._foldColumnName, int) else self._foldColumnName
self._encodingMap = ExprNode("target.encoder.fit", frame, self._teColumns, self._responseColumnName,
self._foldColumnName)._eager_map_frame()
return self._encodingMap
[docs] def encoding_map_frames(self):
return list(map(lambda x: get_frame(x['key']['name']), self._encodingMap.frames))