Source code for h2o.estimators.glm

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#
from .estimator_base import H2OEstimator
from h2o.connection import H2OConnection


class H2OGeneralizedLinearEstimator(H2OEstimator):
    """
    Generalized Linear Modeling
    ---------------------------
    Fits a generalized linear model, specified by a response variable, a set of predictors, and a
    description of the error distribution.

    Parameters (optional, unless specified otherwise)
    ----------
      model_id : str
        Destination id for this model; auto-generated if not specified.

      training_frame : str
        Id of the training data frame (Not required, to allow initial validation of model parameters).

      validation_frame : str
        Id of the validation data frame.

      nfolds : int
        Number of folds for N-fold cross-validation (0 to disable or ≥ 2).
        Default: 0

      seed : int
        Seed for pseudo random number generator (if applicable)
        Default: -1

      keep_cross_validation_predictions : bool
        Whether to keep the predictions of the cross-validation models.
        Default: False

      keep_cross_validation_fold_assignment : bool
        Whether to keep the cross-validation fold assignment.
        Default: False

      fold_assignment : "AUTO" | "Random" | "Modulo" | "Stratified"
        Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option
        will stratify the folds based on the response variable, for classification problems.
        Default: "AUTO"

      fold_column : VecSpecifier
        Column with cross-validation fold index assignment per observation.

      response_column : VecSpecifier
        Response variable column.

      ignored_columns : list(str)
        Names of columns to ignore for training.

      ignore_const_cols : bool
        Ignore constant columns.
        Default: True

      score_each_iteration : bool
        Whether to score during each iteration of model training.
        Default: False

      offset_column : VecSpecifier
        Offset column. This will be added to the combination of columns before applying the link function.

      weights_column : VecSpecifier
        Column with observation weights. Giving some observation a weight of zero is equivalent to
        excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to
        repeating that row twice. Negative weights are not allowed.

      family : "gaussian" | "binomial" | "multinomial" | "poisson" | "gamma" | "tweedie"
        Family. Use binomial for classification with logistic regression, others are for regression
        problems.
        Default: "gaussian"

      tweedie_variance_power : float
        Tweedie variance power
        Default: 0.0

      tweedie_link_power : float
        Tweedie link power
        Default: 1.0

      solver : "AUTO" | "IRLSM" | "L_BFGS" | "COORDINATE_DESCENT_NAIVE" | "COORDINATE_DESCENT"
        AUTO will set the solver based on given data and the other parameters. IRLSM is fast on
        problems with small number of predictors and for lambda-search with L1 penalty, L_BFGS scales
        better for datasets with many columns. Coordinate descent is experimental (beta).
        Default: "AUTO"

      alpha : list(float)
        distribution of regularization between L1 and L2.

      lambda_ : list(float)
        regularization strength

      lambda_search : bool
        use lambda search starting at lambda max, given lambda is then interpreted as lambda min
        Default: False

      early_stopping : bool
        stop early when there is no more relative improvement on train or validation (if provided)
        Default: True

      nlambdas : int
        number of lambdas to be used in a search
        Default: -1

      standardize : bool
        Standardize numeric columns to have zero mean and unit variance
        Default: True

      missing_values_handling : "Skip" | "MeanImputation"
        Handling of missing values. Either Skip or MeanImputation.
        Default: "MeanImputation"

      compute_p_values : bool
        request p-values computation, p-values work only with IRLSM solver and no regularization
        Default: False

      remove_collinear_columns : bool
        in case of linearly dependent columns remove some of the dependent columns
        Default: False

      intercept : bool
        include constant term in the model
        Default: True

      non_negative : bool
        Restrict coefficients (not intercept) to be non-negative
        Default: False

      max_iterations : int
        Maximum number of iterations
        Default: -1

      objective_epsilon : float
        converge if objective value changes less than this
        Default: -1.0

      beta_epsilon : float
        converge if beta changes less (using L-infinity norm) than beta epsilon, ONLY applies to IRLSM
        solver
        Default: 0.0001

      gradient_epsilon : float
        converge if objective changes less (using L-infinity norm) than this, ONLY applies to L-BFGS
        solver
        Default: -1.0

      link : "family_default" | "identity" | "logit" | "log" | "inverse" | "tweedie"
        Default: "family_default"

      prior : float
        prior probability for y==1. To be used only for logistic regression iff the data has been
        sampled and the mean of response does not reflect reality.
        Default: -1.0

      lambda_min_ratio : float
        min lambda used in lambda search, specified as a ratio of lambda_max
        Default: -1.0

      beta_constraints : str
        beta constraints

      max_active_predictors : int
        Maximum number of active predictors during computation. Use as a stopping criterion to prevent
        expensive model building with many predictors.
        Default: -1

      interactions : list(str)
        A list of predictor column indices to interact. All pairwise combinations will be computed for
        the list.

      balance_classes : bool
        Balance training data class counts via over/under-sampling (for imbalanced data).
        Default: False

      class_sampling_factors : list(float)
        Desired over/under-sampling ratios per class (in lexicographic order). If not specified,
        sampling factors will be automatically computed to obtain class balance during training.
        Requires balance_classes.

      max_after_balance_size : float
        Maximum relative size of the training data after balancing class counts (can be less than 1.0).
        Requires balance_classes.
        Default: 5.0

      max_confusion_matrix_size : int
        Maximum size (# classes) for confusion matrices to be printed in the Logs
        Default: 20

      max_hit_ratio_k : int
        Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to
        disable)
        Default: 0

      max_runtime_secs : float
        Maximum allowed runtime in seconds for model training. Use 0 to disable.
        Default: 0.0

    Returns
    -------
      A subclass of ModelBase is returned. The specific subclass depends on the machine learning task
      at hand (if it's binomial classification, then an H2OBinomialModel is returned, if it's regression
      then a H2ORegressionModel is returned). The default print-out of the models is shown, but further
      GLM-specific information can be queried out of the object. Upon completion of the GLM, the
      resulting object has coefficients, normalized coefficients, residual/null deviance, aic, and a
      host of model metrics including MSE, AUC (for logistic regression), degrees of freedom, and
      confusion matrices.
    """

    def __init__(self, **kwargs):
        """
        Store the supplied keyword arguments as model parameters.

        Every known parameter gets an entry in ``self._parms``; parameters that were not
        supplied are stored as None. Keyword names outside the known list are ignored,
        except for the deprecated ``Lambda`` which is accepted as an alias of ``lambda_``.
        """
        super(H2OGeneralizedLinearEstimator, self).__init__()
        self._parms = {}
        known_names = (
            "model_id", "training_frame", "validation_frame", "nfolds", "seed",
            "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
            "fold_assignment", "fold_column", "response_column", "ignored_columns",
            "ignore_const_cols", "score_each_iteration", "offset_column", "weights_column",
            "family", "tweedie_variance_power", "tweedie_link_power", "solver", "alpha",
            "lambda_", "lambda_search", "early_stopping", "nlambdas", "standardize",
            "missing_values_handling", "compute_p_values", "remove_collinear_columns",
            "intercept", "non_negative", "max_iterations", "objective_epsilon", "beta_epsilon",
            "gradient_epsilon", "link", "prior", "lambda_min_ratio", "beta_constraints",
            "max_active_predictors", "interactions", "balance_classes",
            "class_sampling_factors", "max_after_balance_size", "max_confusion_matrix_size",
            "max_hit_ratio_k", "max_runtime_secs",
        )
        for name in known_names:
            # A trailing underscore only exists to dodge Python keywords ("lambda_");
            # the parameter itself is stored without it ("lambda").
            pname = name[:-1] if name.endswith("_") else name
            self._parms[pname] = kwargs.get(name)
        # Deprecated spelling; when given it overrides whatever `lambda_` supplied.
        if "Lambda" in kwargs:
            self._parms["lambda"] = kwargs["Lambda"]

    # ------------------------------------------------------------------
    # Parameter accessors: each property reads/writes its `_parms` entry.
    # ------------------------------------------------------------------

    @property
    def training_frame(self):
        return self._parms["training_frame"]

    @training_frame.setter
    def training_frame(self, value):
        self._parms["training_frame"] = value

    @property
    def validation_frame(self):
        return self._parms["validation_frame"]

    @validation_frame.setter
    def validation_frame(self, value):
        self._parms["validation_frame"] = value

    @property
    def nfolds(self):
        return self._parms["nfolds"]

    @nfolds.setter
    def nfolds(self, value):
        self._parms["nfolds"] = value

    @property
    def seed(self):
        return self._parms["seed"]

    @seed.setter
    def seed(self, value):
        self._parms["seed"] = value

    @property
    def keep_cross_validation_predictions(self):
        return self._parms["keep_cross_validation_predictions"]

    @keep_cross_validation_predictions.setter
    def keep_cross_validation_predictions(self, value):
        self._parms["keep_cross_validation_predictions"] = value

    @property
    def keep_cross_validation_fold_assignment(self):
        return self._parms["keep_cross_validation_fold_assignment"]

    @keep_cross_validation_fold_assignment.setter
    def keep_cross_validation_fold_assignment(self, value):
        self._parms["keep_cross_validation_fold_assignment"] = value

    @property
    def fold_assignment(self):
        return self._parms["fold_assignment"]

    @fold_assignment.setter
    def fold_assignment(self, value):
        self._parms["fold_assignment"] = value

    @property
    def fold_column(self):
        return self._parms["fold_column"]

    @fold_column.setter
    def fold_column(self, value):
        self._parms["fold_column"] = value

    @property
    def response_column(self):
        return self._parms["response_column"]

    @response_column.setter
    def response_column(self, value):
        self._parms["response_column"] = value

    @property
    def ignored_columns(self):
        return self._parms["ignored_columns"]

    @ignored_columns.setter
    def ignored_columns(self, value):
        self._parms["ignored_columns"] = value

    @property
    def ignore_const_cols(self):
        return self._parms["ignore_const_cols"]

    @ignore_const_cols.setter
    def ignore_const_cols(self, value):
        self._parms["ignore_const_cols"] = value

    @property
    def score_each_iteration(self):
        return self._parms["score_each_iteration"]

    @score_each_iteration.setter
    def score_each_iteration(self, value):
        self._parms["score_each_iteration"] = value

    @property
    def offset_column(self):
        return self._parms["offset_column"]

    @offset_column.setter
    def offset_column(self, value):
        self._parms["offset_column"] = value

    @property
    def weights_column(self):
        return self._parms["weights_column"]

    @weights_column.setter
    def weights_column(self, value):
        self._parms["weights_column"] = value

    @property
    def family(self):
        return self._parms["family"]

    @family.setter
    def family(self, value):
        self._parms["family"] = value

    @property
    def tweedie_variance_power(self):
        return self._parms["tweedie_variance_power"]

    @tweedie_variance_power.setter
    def tweedie_variance_power(self, value):
        self._parms["tweedie_variance_power"] = value

    @property
    def tweedie_link_power(self):
        return self._parms["tweedie_link_power"]

    @tweedie_link_power.setter
    def tweedie_link_power(self, value):
        self._parms["tweedie_link_power"] = value

    @property
    def solver(self):
        return self._parms["solver"]

    @solver.setter
    def solver(self, value):
        self._parms["solver"] = value

    @property
    def alpha(self):
        return self._parms["alpha"]

    @alpha.setter
    def alpha(self, value):
        self._parms["alpha"] = value

    @property
    def lambda_(self):
        # Tolerant read (None when the key is absent): this matches what callers
        # observed before, when the misnamed deprecated setter replaced this property
        # with one that reused the `Lambda` getter.
        return self._parms.get("lambda")

    @lambda_.setter
    def lambda_(self, value):
        self._parms["lambda"] = value

    @property
    def lambda_search(self):
        return self._parms["lambda_search"]

    @lambda_search.setter
    def lambda_search(self, value):
        self._parms["lambda_search"] = value

    @property
    def early_stopping(self):
        return self._parms["early_stopping"]

    @early_stopping.setter
    def early_stopping(self, value):
        self._parms["early_stopping"] = value

    @property
    def nlambdas(self):
        return self._parms["nlambdas"]

    @nlambdas.setter
    def nlambdas(self, value):
        self._parms["nlambdas"] = value

    @property
    def standardize(self):
        return self._parms["standardize"]

    @standardize.setter
    def standardize(self, value):
        self._parms["standardize"] = value

    @property
    def missing_values_handling(self):
        return self._parms["missing_values_handling"]

    @missing_values_handling.setter
    def missing_values_handling(self, value):
        self._parms["missing_values_handling"] = value

    @property
    def compute_p_values(self):
        return self._parms["compute_p_values"]

    @compute_p_values.setter
    def compute_p_values(self, value):
        self._parms["compute_p_values"] = value

    @property
    def remove_collinear_columns(self):
        return self._parms["remove_collinear_columns"]

    @remove_collinear_columns.setter
    def remove_collinear_columns(self, value):
        self._parms["remove_collinear_columns"] = value

    @property
    def intercept(self):
        return self._parms["intercept"]

    @intercept.setter
    def intercept(self, value):
        self._parms["intercept"] = value

    @property
    def non_negative(self):
        return self._parms["non_negative"]

    @non_negative.setter
    def non_negative(self, value):
        self._parms["non_negative"] = value

    @property
    def max_iterations(self):
        return self._parms["max_iterations"]

    @max_iterations.setter
    def max_iterations(self, value):
        self._parms["max_iterations"] = value

    @property
    def objective_epsilon(self):
        return self._parms["objective_epsilon"]

    @objective_epsilon.setter
    def objective_epsilon(self, value):
        self._parms["objective_epsilon"] = value

    @property
    def beta_epsilon(self):
        return self._parms["beta_epsilon"]

    @beta_epsilon.setter
    def beta_epsilon(self, value):
        self._parms["beta_epsilon"] = value

    @property
    def gradient_epsilon(self):
        return self._parms["gradient_epsilon"]

    @gradient_epsilon.setter
    def gradient_epsilon(self, value):
        self._parms["gradient_epsilon"] = value

    @property
    def link(self):
        return self._parms["link"]

    @link.setter
    def link(self, value):
        self._parms["link"] = value

    @property
    def prior(self):
        return self._parms["prior"]

    @prior.setter
    def prior(self, value):
        self._parms["prior"] = value

    @property
    def lambda_min_ratio(self):
        return self._parms["lambda_min_ratio"]

    @lambda_min_ratio.setter
    def lambda_min_ratio(self, value):
        self._parms["lambda_min_ratio"] = value

    @property
    def beta_constraints(self):
        return self._parms["beta_constraints"]

    @beta_constraints.setter
    def beta_constraints(self, value):
        self._parms["beta_constraints"] = value

    @property
    def max_active_predictors(self):
        return self._parms["max_active_predictors"]

    @max_active_predictors.setter
    def max_active_predictors(self, value):
        self._parms["max_active_predictors"] = value

    @property
    def interactions(self):
        return self._parms["interactions"]

    @interactions.setter
    def interactions(self, value):
        self._parms["interactions"] = value

    @property
    def balance_classes(self):
        return self._parms["balance_classes"]

    @balance_classes.setter
    def balance_classes(self, value):
        self._parms["balance_classes"] = value

    @property
    def class_sampling_factors(self):
        return self._parms["class_sampling_factors"]

    @class_sampling_factors.setter
    def class_sampling_factors(self, value):
        self._parms["class_sampling_factors"] = value

    @property
    def max_after_balance_size(self):
        return self._parms["max_after_balance_size"]

    @max_after_balance_size.setter
    def max_after_balance_size(self, value):
        self._parms["max_after_balance_size"] = value

    @property
    def max_confusion_matrix_size(self):
        return self._parms["max_confusion_matrix_size"]

    @max_confusion_matrix_size.setter
    def max_confusion_matrix_size(self, value):
        self._parms["max_confusion_matrix_size"] = value

    @property
    def max_hit_ratio_k(self):
        return self._parms["max_hit_ratio_k"]

    @max_hit_ratio_k.setter
    def max_hit_ratio_k(self, value):
        self._parms["max_hit_ratio_k"] = value

    @property
    def max_runtime_secs(self):
        return self._parms["max_runtime_secs"]

    @max_runtime_secs.setter
    def max_runtime_secs(self, value):
        self._parms["max_runtime_secs"] = value

    @property
    def Lambda(self):
        """[DEPRECATED] Use self.lambda_ instead"""
        return self._parms["lambda"] if "lambda" in self._parms else None

    # The setter must carry the same name as the property it extends; naming it
    # `lambda_` left `Lambda` read-only and clobbered the real `lambda_` property.
    @Lambda.setter
    def Lambda(self, value):
        """[DEPRECATED] Use self.lambda_ instead"""
        self._parms["lambda"] = value

    @staticmethod
    def getGLMRegularizationPath(model):
        """
        Extract full regularization path explored during lambda search from glm model.

        @param model - source lambda search model
        @return dict with keys "lambdas", "explained_deviance_train", "explained_deviance_valid"
                and "coefficients" (one {coefficient name: value} dict per entry of the
                server's coefficient list); "coefficients_std" is added in the same shape
                when the server response contains it.
        """
        x = H2OConnection.get_json("GetGLMRegPath", model=model._model_json["model_id"]["name"])
        ns = x.pop("coefficient_names")
        res = {
            "lambdas": x["lambdas"],
            "explained_deviance_train": x["explained_deviance_train"],
            "explained_deviance_valid": x["explained_deviance_valid"],
            "coefficients": [dict(zip(ns, y)) for y in x["coefficients"]],
        }
        if "coefficients_std" in x:
            res["coefficients_std"] = [dict(zip(ns, y)) for y in x["coefficients_std"]]
        return res

    @staticmethod
    def makeGLMModel(model, coefs, threshold=.5):
        """
        Create a custom GLM model using the given coefficients.
        Needs to be passed source model trained on the dataset to extract the dataset information from.

        @param model - source model, used for extracting dataset information
        @param coefs - dictionary containing model coefficients
        @param threshold - (optional, only for binomial) decision threshold used for classification
        @return a new H2OGeneralizedLinearEstimator resolved against the model the server created
        """
        model_json = H2OConnection.post_json(
            "MakeGLMModel",
            model=model._model_json["model_id"]["name"],
            names=list(coefs.keys()),
            beta=list(coefs.values()),
            threshold=threshold,
        )
        m = H2OGeneralizedLinearEstimator()
        m._resolve_model(model_json["model_id"]["name"], model_json)
        return m