Source code for h2o.estimators.glm

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#
from .estimator_base import H2OEstimator
from h2o.connection import H2OConnection


class H2OGeneralizedLinearEstimator(H2OEstimator):
    """
    Generalized Linear Modeling
    ---------------------------
    Fits a generalized linear model, specified by a response variable, a set of predictors, and a
    description of the error distribution.

    Parameters (optional, unless specified otherwise)
    ----------
      model_id : str
        Destination id for this model; auto-generated if not specified.

      training_frame : str
        Id of the training data frame (Not required, to allow initial validation of model parameters).

      validation_frame : str
        Id of the validation data frame.

      nfolds : int
        Number of folds for N-fold cross-validation (0 to disable or ≥ 2).
        Default: 0

      seed : int
        Seed for pseudo random number generator (if applicable)
        Default: -1

      keep_cross_validation_predictions : bool
        Whether to keep the predictions of the cross-validation models.
        Default: False

      keep_cross_validation_fold_assignment : bool
        Whether to keep the cross-validation fold assignment.
        Default: False

      fold_assignment : "AUTO" | "Random" | "Modulo" | "Stratified"
        Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
        the folds based on the response variable, for classification problems.
        Default: "AUTO"

      fold_column : VecSpecifier
        Column with cross-validation fold index assignment per observation.

      response_column : VecSpecifier
        Response variable column.

      ignored_columns : list(str)
        Names of columns to ignore for training.

      ignore_const_cols : bool
        Ignore constant columns.
        Default: True

      score_each_iteration : bool
        Whether to score during each iteration of model training.
        Default: False

      offset_column : VecSpecifier
        Offset column. This will be added to the combination of columns before applying the link function.

      weights_column : VecSpecifier
        Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
        dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
        weights are not allowed.

      family : "gaussian" | "binomial" | "multinomial" | "poisson" | "gamma" | "tweedie"
        Family. Use binomial for classification with logistic regression, others are for regression problems.
        Default: "gaussian"

      tweedie_variance_power : float
        Tweedie variance power
        Default: 0.0

      tweedie_link_power : float
        Tweedie link power
        Default: 1.0

      solver : "AUTO" | "IRLSM" | "L_BFGS" | "COORDINATE_DESCENT_NAIVE" | "COORDINATE_DESCENT"
        AUTO will set the solver based on given data and the other parameters. IRLSM is fast on problems with small
        number of predictors and for lambda-search with L1 penalty, L_BFGS scales better for datasets with many columns.
        Coordinate descent is experimental (beta).
        Default: "AUTO"

      alpha : list(float)
        distribution of regularization between L1 and L2.

      lambda_ : list(float)
        regularization strength

      lambda_search : bool
        use lambda search starting at lambda max, given lambda is then interpreted as lambda min
        Default: False

      early_stopping : bool
        stop early when there is no more relative improvement on train or validation (if provided)
        Default: True

      nlambdas : int
        number of lambdas to be used in a search
        Default: -1

      standardize : bool
        Standardize numeric columns to have zero mean and unit variance
        Default: True

      missing_values_handling : "Skip" | "MeanImputation"
        Handling of missing values. Either Skip or MeanImputation.
        Default: "MeanImputation"

      compute_p_values : bool
        request p-values computation, p-values work only with IRLSM solver and no regularization
        Default: False

      remove_collinear_columns : bool
        in case of linearly dependent columns remove some of the dependent columns
        Default: False

      intercept : bool
        include constant term in the model
        Default: True

      non_negative : bool
        Restrict coefficients (not intercept) to be non-negative
        Default: False

      max_iterations : int
        Maximum number of iterations
        Default: -1

      objective_epsilon : float
        converge if objective value changes less than this
        Default: -1.0

      beta_epsilon : float
        converge if beta changes less (using L-infinity norm) than beta epsilon, ONLY applies to IRLSM solver
        Default: 0.0001

      gradient_epsilon : float
        converge if objective changes less (using L-infinity norm) than this, ONLY applies to L-BFGS solver
        Default: -1.0

      link : "family_default" | "identity" | "logit" | "log" | "inverse" | "tweedie"
        Link function.
        Default: "family_default"

      prior : float
        prior probability for y==1. To be used only for logistic regression iff the data has been sampled and the mean
        of response does not reflect reality.
        Default: -1.0

      lambda_min_ratio : float
        min lambda used in lambda search, specified as a ratio of lambda_max
        Default: -1.0

      beta_constraints : str
        beta constraints

      max_active_predictors : int
        Maximum number of active predictors during computation. Use as a stopping criterion to prevent expensive model
        building with many predictors.
        Default: -1

      interactions : list(str)
        A list of predictor column indices to interact. All pairwise combinations will be computed for the list.

      balance_classes : bool
        Balance training data class counts via over/under-sampling (for imbalanced data).
        Default: False

      class_sampling_factors : list(float)
        Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
        be automatically computed to obtain class balance during training. Requires balance_classes.

      max_after_balance_size : float
        Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
        balance_classes.
        Default: 5.0

      max_confusion_matrix_size : int
        Maximum size (# classes) for confusion matrices to be printed in the Logs
        Default: 20

      max_hit_ratio_k : int
        Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)
        Default: 0

      max_runtime_secs : float
        Maximum allowed runtime in seconds for model training. Use 0 to disable.
        Default: 0.0

    Returns
    -------
    A subclass of ModelBase is returned. The specific subclass depends on the machine learning task at hand
    (if it's binomial classification, then an H2OBinomialModel is returned, if it's regression then a
    H2ORegressionModel is returned). The default print-out of the models is shown, but further GLM-specific
    information can be queried out of the object. Upon completion of the GLM, the resulting object has
    coefficients, normalized coefficients, residual/null deviance, aic, and a host of model metrics including
    MSE, AUC (for logistic regression), degrees of freedom, and confusion matrices.
    """
    def __init__(self, **kwargs):
        """
        Store the recognized keyword arguments in ``self._parms``.

        Every known parameter gets an entry; parameters that were not supplied are stored as None.
        A trailing underscore is stripped from the parameter name to form the stored key, so the
        ``lambda_`` argument is stored under ``"lambda"``. Keyword arguments that are not in the
        list below are ignored, except for the deprecated ``Lambda`` spelling.
        """
        super(H2OGeneralizedLinearEstimator, self).__init__()
        self._parms = {}
        for name in ["model_id", "training_frame", "validation_frame", "nfolds", "seed",
                     "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment", "fold_assignment",
                     "fold_column", "response_column", "ignored_columns", "ignore_const_cols", "score_each_iteration",
                     "offset_column", "weights_column", "family", "tweedie_variance_power", "tweedie_link_power",
                     "solver", "alpha", "lambda_", "lambda_search", "early_stopping", "nlambdas", "standardize",
                     "missing_values_handling", "compute_p_values", "remove_collinear_columns", "intercept",
                     "non_negative", "max_iterations", "objective_epsilon", "beta_epsilon", "gradient_epsilon", "link",
                     "prior", "lambda_min_ratio", "beta_constraints", "max_active_predictors", "interactions",
                     "balance_classes", "class_sampling_factors", "max_after_balance_size", "max_confusion_matrix_size",
                     "max_hit_ratio_k", "max_runtime_secs"]:
            pname = name[:-1] if name.endswith("_") else name
            self._parms[pname] = kwargs.get(name)
        # Deprecated spelling: when given, it takes precedence over `lambda_`.
        if "Lambda" in kwargs:
            self._parms["lambda"] = kwargs["Lambda"]

    # Each property below reads and writes the matching entry of self._parms.

    @property
    def training_frame(self):
        return self._parms["training_frame"]

    @training_frame.setter
    def training_frame(self, value):
        self._parms["training_frame"] = value

    @property
    def validation_frame(self):
        return self._parms["validation_frame"]

    @validation_frame.setter
    def validation_frame(self, value):
        self._parms["validation_frame"] = value

    @property
    def nfolds(self):
        return self._parms["nfolds"]

    @nfolds.setter
    def nfolds(self, value):
        self._parms["nfolds"] = value

    @property
    def seed(self):
        return self._parms["seed"]

    @seed.setter
    def seed(self, value):
        self._parms["seed"] = value

    @property
    def keep_cross_validation_predictions(self):
        return self._parms["keep_cross_validation_predictions"]

    @keep_cross_validation_predictions.setter
    def keep_cross_validation_predictions(self, value):
        self._parms["keep_cross_validation_predictions"] = value

    @property
    def keep_cross_validation_fold_assignment(self):
        return self._parms["keep_cross_validation_fold_assignment"]

    @keep_cross_validation_fold_assignment.setter
    def keep_cross_validation_fold_assignment(self, value):
        self._parms["keep_cross_validation_fold_assignment"] = value

    @property
    def fold_assignment(self):
        return self._parms["fold_assignment"]

    @fold_assignment.setter
    def fold_assignment(self, value):
        self._parms["fold_assignment"] = value

    @property
    def fold_column(self):
        return self._parms["fold_column"]

    @fold_column.setter
    def fold_column(self, value):
        self._parms["fold_column"] = value

    @property
    def response_column(self):
        return self._parms["response_column"]

    @response_column.setter
    def response_column(self, value):
        self._parms["response_column"] = value

    @property
    def ignored_columns(self):
        return self._parms["ignored_columns"]

    @ignored_columns.setter
    def ignored_columns(self, value):
        self._parms["ignored_columns"] = value

    @property
    def ignore_const_cols(self):
        return self._parms["ignore_const_cols"]

    @ignore_const_cols.setter
    def ignore_const_cols(self, value):
        self._parms["ignore_const_cols"] = value

    @property
    def score_each_iteration(self):
        return self._parms["score_each_iteration"]

    @score_each_iteration.setter
    def score_each_iteration(self, value):
        self._parms["score_each_iteration"] = value

    @property
    def offset_column(self):
        return self._parms["offset_column"]

    @offset_column.setter
    def offset_column(self, value):
        self._parms["offset_column"] = value

    @property
    def weights_column(self):
        return self._parms["weights_column"]

    @weights_column.setter
    def weights_column(self, value):
        self._parms["weights_column"] = value

    @property
    def family(self):
        return self._parms["family"]

    @family.setter
    def family(self, value):
        self._parms["family"] = value

    @property
    def tweedie_variance_power(self):
        return self._parms["tweedie_variance_power"]

    @tweedie_variance_power.setter
    def tweedie_variance_power(self, value):
        self._parms["tweedie_variance_power"] = value

    @property
    def tweedie_link_power(self):
        return self._parms["tweedie_link_power"]

    @tweedie_link_power.setter
    def tweedie_link_power(self, value):
        self._parms["tweedie_link_power"] = value

    @property
    def solver(self):
        return self._parms["solver"]

    @solver.setter
    def solver(self, value):
        self._parms["solver"] = value

    @property
    def alpha(self):
        return self._parms["alpha"]

    @alpha.setter
    def alpha(self, value):
        self._parms["alpha"] = value

    @property
    def lambda_(self):
        return self._parms["lambda"]

    @lambda_.setter
    def lambda_(self, value):
        self._parms["lambda"] = value

    @property
    def lambda_search(self):
        return self._parms["lambda_search"]

    @lambda_search.setter
    def lambda_search(self, value):
        self._parms["lambda_search"] = value

    @property
    def early_stopping(self):
        return self._parms["early_stopping"]

    @early_stopping.setter
    def early_stopping(self, value):
        self._parms["early_stopping"] = value

    @property
    def nlambdas(self):
        return self._parms["nlambdas"]

    @nlambdas.setter
    def nlambdas(self, value):
        self._parms["nlambdas"] = value

    @property
    def standardize(self):
        return self._parms["standardize"]

    @standardize.setter
    def standardize(self, value):
        self._parms["standardize"] = value

    @property
    def missing_values_handling(self):
        return self._parms["missing_values_handling"]

    @missing_values_handling.setter
    def missing_values_handling(self, value):
        self._parms["missing_values_handling"] = value

    @property
    def compute_p_values(self):
        return self._parms["compute_p_values"]

    @compute_p_values.setter
    def compute_p_values(self, value):
        self._parms["compute_p_values"] = value

    @property
    def remove_collinear_columns(self):
        return self._parms["remove_collinear_columns"]

    @remove_collinear_columns.setter
    def remove_collinear_columns(self, value):
        self._parms["remove_collinear_columns"] = value

    @property
    def intercept(self):
        return self._parms["intercept"]

    @intercept.setter
    def intercept(self, value):
        self._parms["intercept"] = value

    @property
    def non_negative(self):
        return self._parms["non_negative"]

    @non_negative.setter
    def non_negative(self, value):
        self._parms["non_negative"] = value

    @property
    def max_iterations(self):
        return self._parms["max_iterations"]

    @max_iterations.setter
    def max_iterations(self, value):
        self._parms["max_iterations"] = value

    @property
    def objective_epsilon(self):
        return self._parms["objective_epsilon"]

    @objective_epsilon.setter
    def objective_epsilon(self, value):
        self._parms["objective_epsilon"] = value

    @property
    def beta_epsilon(self):
        return self._parms["beta_epsilon"]

    @beta_epsilon.setter
    def beta_epsilon(self, value):
        self._parms["beta_epsilon"] = value

    @property
    def gradient_epsilon(self):
        return self._parms["gradient_epsilon"]

    @gradient_epsilon.setter
    def gradient_epsilon(self, value):
        self._parms["gradient_epsilon"] = value

    @property
    def link(self):
        return self._parms["link"]

    @link.setter
    def link(self, value):
        self._parms["link"] = value

    @property
    def prior(self):
        return self._parms["prior"]

    @prior.setter
    def prior(self, value):
        self._parms["prior"] = value

    @property
    def lambda_min_ratio(self):
        return self._parms["lambda_min_ratio"]

    @lambda_min_ratio.setter
    def lambda_min_ratio(self, value):
        self._parms["lambda_min_ratio"] = value

    @property
    def beta_constraints(self):
        return self._parms["beta_constraints"]

    @beta_constraints.setter
    def beta_constraints(self, value):
        self._parms["beta_constraints"] = value

    @property
    def max_active_predictors(self):
        return self._parms["max_active_predictors"]

    @max_active_predictors.setter
    def max_active_predictors(self, value):
        self._parms["max_active_predictors"] = value

    @property
    def interactions(self):
        return self._parms["interactions"]

    @interactions.setter
    def interactions(self, value):
        self._parms["interactions"] = value

    @property
    def balance_classes(self):
        return self._parms["balance_classes"]

    @balance_classes.setter
    def balance_classes(self, value):
        self._parms["balance_classes"] = value

    @property
    def class_sampling_factors(self):
        return self._parms["class_sampling_factors"]

    @class_sampling_factors.setter
    def class_sampling_factors(self, value):
        self._parms["class_sampling_factors"] = value

    @property
    def max_after_balance_size(self):
        return self._parms["max_after_balance_size"]

    @max_after_balance_size.setter
    def max_after_balance_size(self, value):
        self._parms["max_after_balance_size"] = value

    @property
    def max_confusion_matrix_size(self):
        return self._parms["max_confusion_matrix_size"]

    @max_confusion_matrix_size.setter
    def max_confusion_matrix_size(self, value):
        self._parms["max_confusion_matrix_size"] = value

    @property
    def max_hit_ratio_k(self):
        return self._parms["max_hit_ratio_k"]

    @max_hit_ratio_k.setter
    def max_hit_ratio_k(self, value):
        self._parms["max_hit_ratio_k"] = value

    @property
    def max_runtime_secs(self):
        return self._parms["max_runtime_secs"]

    @max_runtime_secs.setter
    def max_runtime_secs(self, value):
        self._parms["max_runtime_secs"] = value

    @property
    def Lambda(self):
        """[DEPRECATED] Use self.lambda_ instead"""
        return self._parms.get("lambda")

    # The setter function must carry the property's own name. Naming it
    # `lambda_` would leave `Lambda` read-only and rebind `lambda_` instead.
    @Lambda.setter
    def Lambda(self, value):
        """[DEPRECATED] Use self.lambda_ instead"""
        self._parms["lambda"] = value

    @staticmethod
    def getGLMRegularizationPath(model):
        """
        Extract full regularization path explored during lambda search from glm model.

        @param model - source lambda search model
        @return dictionary with keys "lambdas", "explained_deviance_train", "explained_deviance_valid"
                and "coefficients"; "coefficients" is a list of dictionaries, each mapping coefficient
                name to value. "coefficients_std" (same layout) is included only when the server
                response contains it.
        """
        x = H2OConnection.get_json("GetGLMRegPath", model=model._model_json["model_id"]["name"])
        ns = x.pop("coefficient_names")
        res = {
            "lambdas": x["lambdas"],
            "explained_deviance_train": x["explained_deviance_train"],
            "explained_deviance_valid": x["explained_deviance_valid"],
            "coefficients": [dict(zip(ns, y)) for y in x["coefficients"]],
        }
        if "coefficients_std" in x:
            res["coefficients_std"] = [dict(zip(ns, y)) for y in x["coefficients_std"]]
        return res

    @staticmethod
    def makeGLMModel(model, coefs, threshold=.5):
        """
        Create a custom GLM model using the given coefficients.
        Needs to be passed source model trained on the dataset to extract the dataset information from.

          @param model - source model, used for extracting dataset information
          @param coefs - dictionary containing model coefficients
          @param threshold - (optional, only for binomial) decision threshold used for classification
          @return a new H2OGeneralizedLinearEstimator resolved against the model created by the server
        """
        model_json = H2OConnection.post_json("MakeGLMModel", model=model._model_json["model_id"]["name"],
            names=list(coefs.keys()), beta=list(coefs.values()), threshold=threshold)
        m = H2OGeneralizedLinearEstimator()
        m._resolve_model(model_json["model_id"]["name"], model_json)
        return m