Source code for h2o.estimators.estimator_base

from ..model.model_base import ModelBase
from ..model.autoencoder import H2OAutoEncoderModel
from ..model.binomial import H2OBinomialModel
from ..model.clustering import H2OClusteringModel
from ..model.dim_reduction import H2ODimReductionModel
from ..model.multinomial import H2OMultinomialModel
from ..model.regression import H2ORegressionModel
from ..model.metrics_base import *
from ..h2o import H2OConnection, H2OJob, H2OFrame
import h2o
import inspect
import warnings
import types


class EstimatorAttributeError(AttributeError):
  """Raised when an estimator does not provide a requested method.

  Parameters
  ----------
    obj : object
      The object the method was looked up on; only its class name is used in the message.
    method : str
      Name of the missing method.
  """
  def __init__(self,obj,method):
    # super() must name *this* class; naming AttributeError would skip AttributeError's
    # own initializer in the MRO.
    super(EstimatorAttributeError, self).__init__("No {} method for {}".format(method,obj.__class__.__name__))


class H2OEstimator(ModelBase):
  """H2O Estimators

  H2O Estimators implement the following methods for model construction:
    * start - Top-level user-facing API for asynchronous model build
    * join  - Top-level user-facing API for blocking on async model build
    * train - Top-level user-facing API for model building.
    * fit - Used by scikit-learn.

  Because H2OEstimator instances are instances of ModelBase, these objects can use the
  H2O model API.
  """

[docs]  def start(self,x,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
    """Asynchronous model build by specifying the predictor columns, response column, and any
    additional frame-specific values.

    To block for results, call join.

    Parameters
    ----------
      x : list
        A list of column names or indices indicating the predictor columns.
      y : str
        An index or a column name indicating the response column.
      training_frame : H2OFrame
        The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
      offset_column : str, optional
        The name or index of the column in training_frame that holds the offsets.
      fold_column : str, optional
        The name or index of the column in training_frame that holds the per-row fold
        assignments.
      weights_column : str, optional
        The name or index of the column in training_frame that holds the per-row weights.
      validation_frame : H2OFrame, optional
        H2OFrame with validation data to be scored on while training.
    """
    self._future=True
    self.train(x=x,
               y=y,
               training_frame=training_frame,
               offset_column=offset_column,
               fold_column=fold_column,
               weights_column=weights_column,
               validation_frame=validation_frame,
               **params)

[docs]  def join(self):
    self._future=False
    self._job.poll()
    self._job=None

[docs]  def train(self,x,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,max_runtime_secs=None,**params):
    """Train the H2O model by specifying the predictor columns, response column, and any
    additional frame-specific values.

    Parameters
    ----------

    x : list
      A list of column names or indices indicating the predictor columns.
    y : str
      An index or a column name indicating the response column.
    training_frame : H2OFrame
      The H2OFrame having the columns indicated by x and y (as well as any
      additional columns specified by fold, offset, and weights).
    offset_column : str, optional
      The name or index of the column in training_frame that holds the offsets.
    fold_column : str, optional
      The name or index of the column in training_frame that holds the per-row fold
      assignments.
    weights_column : str, optional
      The name or index of the column in training_frame that holds the per-row weights.
    validation_frame : H2OFrame, optional
      H2OFrame with validation data to be scored on while training.
    max_runtime_secs : float
      Maximum allowed runtime in seconds for model training. Use 0 to disable.
    """
    algo_params = locals()
    parms = self._parms.copy()
    if '__class__' in parms:  # FIXME: hackt for PY3
      del parms['__class__']
    parms.update({k:v for k, v in algo_params.items() if k not in ["self","params", "algo_params", "parms"] })
    y = algo_params["y"]
    tframe = algo_params["training_frame"]
    if tframe is None: raise ValueError("Missing training_frame")
    if y is not None:
      if isinstance(y, (list, tuple)):
        if len(y) == 1: parms["y"] = y[0]
        else: raise ValueError('y must be a single column reference')
      self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
    self.build_model(parms)

[docs]  def build_model(self, algo_params):
    if algo_params["training_frame"] is None: raise ValueError("Missing training_frame")
    x = algo_params.pop("x")
    y = algo_params.pop("y",None)
    training_frame = algo_params.pop("training_frame")
    validation_frame = algo_params.pop("validation_frame",None)
    is_auto_encoder = (algo_params is not None) and ("autoencoder" in algo_params and algo_params["autoencoder"])
    algo = self._compute_algo()
    is_unsupervised = is_auto_encoder or algo == "pca" or algo == "svd" or algo == "kmeans" or algo == "glrm"
    if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None: raise ValueError("Missing response")
    self._model_build(x, y, training_frame, validation_frame, algo_params)

  def _model_build(self, x, y, tframe, vframe, kwargs):
    """Assemble the builder request, launch the build and, unless asynchronous, load the model.

    Parameters
    ----------
      x : list, tuple or single column reference
        Predictor columns, as names or integer indices into tframe.names.
      y : str, int or None
        Response column name or index; None when there is no response.
      tframe : H2OFrame
        Training frame.
      vframe : H2OFrame or None
        Validation frame.
      kwargs : dict
        Remaining builder parameters; updated in place with the frame, response and
        ignored-column entries before the request is sent.

    Every training-frame column that is not a predictor, the response, or the
    offset / fold / weights column is sent as an ignored column. When self._future is set
    the job is stored in self._job and the method returns without waiting.
    """
    names = tframe.names
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list,tuple)): x=[x]
    # Resolve indices element by element; this also turns a tuple into a list and copes
    # with an empty predictor list.
    x = [names[col] if isinstance(col, int) else col for col in x]
    # The special columns may be absent from kwargs and may be given by index, like y.
    special = []
    for key in ("offset_column", "fold_column", "weights_column"):
      col = kwargs.get(key)
      if isinstance(col, int):
        col = kwargs[key] = names[col]
      special.append(col)
    used = set(x)
    used.update([y] + special)
    ignored_columns = [col for col in names if col not in used]
    kwargs["ignored_columns"] = [h2o.h2o._quoted(col) for col in ignored_columns] if ignored_columns else None
    # Drop unset parameters and send frames by their id.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self._compute_algo()

    model = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo, **kwargs), job_type=(algo+" Model Build"))

    if self._future:
      self._job = model
      return

    model.poll()
    if '_rest_version' in kwargs:
      model_json = H2OConnection.get_json("Models/"+model.dest_key, _rest_version=kwargs['_rest_version'])["models"][0]
    else:
      model_json = H2OConnection.get_json("Models/"+model.dest_key)["models"][0]
    self._resolve_model(model.dest_key,model_json)

  def _resolve_model(self, model_id, model_json):
    metrics_class, model_class = H2OEstimator._metrics_class(model_json)
    m = model_class()
    m._id = model_id
    m._model_json = model_json
    m._metrics_class = metrics_class
    m._parms = self._parms
    m._estimator_type = self._estimator_type

    if model_id is not None and model_json is not None and metrics_class is not None:
      # build Metric objects out of each metrics
      for metric in ["training_metrics", "validation_metrics", "cross_validation_metrics"]:
        if metric in model_json["output"]:
          if model_json["output"][metric] is not None:
            if metric=="cross_validation_metrics":
              m._is_xvalidated=True
            model_json["output"][metric] = metrics_class(model_json["output"][metric],metric,model_json["algo"])

      if m._is_xvalidated: m._xval_keys= [i["name"] for i in model_json["output"]["cross_validation_models"]]

      # build a useful dict of the params
      for p in m._model_json["parameters"]: m.parms[p["label"]]=p
    H2OEstimator.mixin(self,model_class)
    self.__dict__.update(m.__dict__.copy())

  def _compute_algo(self):
    name = self.__class__.__name__
    if name == "H2ODeepLearningEstimator":       return "deeplearning"
    if name == "H2OAutoEncoderEstimator":        return "deeplearning"
    if name == "H2OGradientBoostingEstimator":   return "gbm"
    if name == "H2OGeneralizedLinearEstimator":  return "glm"
    if name == "H2OGeneralizedLowRankEstimator": return "glrm"
    if name == "H2OKMeansEstimator":             return "kmeans"
    if name == "H2ONaiveBayesEstimator":         return "naivebayes"
    if name == "H2ORandomForestEstimator":       return "drf"
    if name == "H2OPCA":                         return "pca"
    if name == "H2OSVD":                         return "svd"

  @staticmethod
[docs]  def mixin(obj,cls):
    for name in cls.__dict__:
      if name.startswith('__') and name.endswith('__') or not type(cls.__dict__[name])==types.FunctionType:
        continue
      obj.__dict__[name]=cls.__dict__[name].__get__(obj)

  ##### Scikit-learn Interface Methods #####
[docs]  def fit(self, X, y=None, **params):
    """Fit an H2O model as part of a scikit-learn pipeline or grid search.

    A warning will be issued if a caller other than sklearn attempts to use this method.

    Parameters
    ----------
      X : H2OFrame
        An H2OFrame consisting of the predictor variables.
      y : H2OFrame, optional
        An H2OFrame consisting of the response variable.
      params : optional
        Extra arguments.

    Returns
    -------
      The current instance of H2OEstimator for method chaining.
    """
    stk = inspect.stack()[1:]
    warn = True
    for s in stk:
      mod = inspect.getmodule(s[0])
      if mod:
        warn = "sklearn" not in mod.__name__
        if not warn: break
    if warn:
      warnings.warn("\n\n\t`fit` is not recommended outside of the sklearn framework. Use `train` instead.", UserWarning, stacklevel=2)
    training_frame = X.cbind(y) if y is not None else X
    X = X.names
    y = y.names[0] if y is not None else None
    self.train(X, y, training_frame, **params)
    return self

[docs]  def get_params(self, deep=True):
    """Useful method for obtaining parameters for this estimator. Used primarily for
    sklearn Pipelines and sklearn grid search.

    Parameters
    ----------
      deep : bool, optional
        If True, return parameters of all sub-objects that are estimators.

    Returns
    -------
      A dict of parameters
    """
    out = dict()
    for key,value in self.parms.items():
      if deep and isinstance(value, H2OEstimator):
        deep_items = list(value.get_params().items())
        out.update((key + '__' + k, val) for k, val in deep_items)
      out[key] = value
    return out

[docs]  def set_params(self, **parms):
    """Used by sklearn for updating parameters during grid search.

    Parameters
    ----------
      parms : dict
        A dictionary of parameters that will be set on this model.

    Returns
    -------
      Returns self, the current estimator object with the parameters all set as desired.
    """
    self._parms.update(parms)
    return self

  @staticmethod
  def _metrics_class(model_json):
    model_type = model_json["output"]["model_category"]
    if model_type=="Binomial":       metrics_class = H2OBinomialModelMetrics;      model_class = H2OBinomialModel
    elif model_type=="Clustering":   metrics_class = H2OClusteringModelMetrics;    model_class = H2OClusteringModel
    elif model_type=="Regression":   metrics_class = H2ORegressionModelMetrics;    model_class = H2ORegressionModel
    elif model_type=="Multinomial":  metrics_class = H2OMultinomialModelMetrics;   model_class = H2OMultinomialModel
    elif model_type=="AutoEncoder":  metrics_class = H2OAutoEncoderModelMetrics;   model_class = H2OAutoEncoderModel
    elif model_type=="DimReduction": metrics_class = H2ODimReductionModelMetrics;  model_class = H2ODimReductionModel
    else: raise NotImplementedError(model_type)
    return [metrics_class,model_class]