Source code for h2o.estimators.hglm

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OHGLMEstimator(H2OEstimator):
    """
    Hierarchical Generalized Linear Model

    Fits a HGLM model with both the residual noise and random effect being modeled by Gaussian distribution. The fixed
    effect coefficients are specified in parameter x, the random effect coefficients are specified in parameter
    random_columns. The column specified in group_column will contain the level 2 index value and must be an enum
    column.
    """

    algo = "hglm"
    supervised_learning = True

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 response_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 offset_column=None,  # type: Optional[str]
                 weights_column=None,  # type: Optional[str]
                 max_runtime_secs=0.0,  # type: float
                 custom_metric_func=None,  # type: Optional[str]
                 score_each_iteration=False,  # type: bool
                 score_iteration_interval=5,  # type: int
                 seed=-1,  # type: int
                 missing_values_handling="mean_imputation",  # type: Literal["mean_imputation", "skip", "plug_values"]
                 plug_values=None,  # type: Optional[Union[None, str, H2OFrame]]
                 family="gaussian",  # type: Literal["gaussian"]
                 rand_family=None,  # type: Optional[Literal["gaussian"]]
                 max_iterations=-1,  # type: int
                 initial_fixed_effects=None,  # type: Optional[List[float]]
                 initial_random_effects=None,  # type: Optional[Union[None, str, H2OFrame]]
                 initial_t_matrix=None,  # type: Optional[Union[None, str, H2OFrame]]
                 tau_u_var_init=0.0,  # type: float
                 tau_e_var_init=0.0,  # type: float
                 random_columns=None,  # type: Optional[List[str]]
                 method="em",  # type: Literal["em"]
                 em_epsilon=0.001,  # type: float
                 random_intercept=True,  # type: bool
                 group_column=None,  # type: Optional[str]
                 gen_syn_data=False,  # type: bool
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param response_column: Response variable column.
               Defaults to ``None``.
        :type response_column: str, optional
        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns.
               Defaults to ``True``.
        :type ignore_const_cols: bool
        :param offset_column: Offset column. This will be added to the combination of columns before applying the
               link function.
               Defaults to ``None``.
        :type offset_column: str, optional
        :param weights_column: Column with observation weights. Giving some observation a weight of zero is
               equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is
               equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row
               observation weights and do not increase the size of the data frame. This is typically the number of
               times a row is repeated, but non-integer values are supported as well. During training, rows with
               higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a
               row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate
               prediction, remove all rows with weight == 0.
               Defaults to ``None``.
        :type weights_column: str, optional
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName`
               Defaults to ``None``.
        :type custom_metric_func: str, optional
        :param score_each_iteration: Whether to score during each iteration of model training.
               Defaults to ``False``.
        :type score_each_iteration: bool
        :param score_iteration_interval: Perform scoring for every score_iteration_interval iterations.
               Defaults to ``5``.
        :type score_iteration_interval: int
        :param seed: Seed for pseudo random number generator (if applicable).
               Defaults to ``-1``.
        :type seed: int
        :param missing_values_handling: Handling of missing values. Either MeanImputation, Skip or PlugValues.
               Defaults to ``"mean_imputation"``.
        :type missing_values_handling: Literal["mean_imputation", "skip", "plug_values"]
        :param plug_values: Plug Values (a single row frame containing values that will be used to impute missing
               values of the training/validation frame, used in conjunction with missing_values_handling = PlugValues).
               Defaults to ``None``.
        :type plug_values: Union[None, str, H2OFrame], optional
        :param family: Family. Only gaussian is supported now.
               Defaults to ``"gaussian"``.
        :type family: Literal["gaussian"]
        :param rand_family: Set distribution of random effects. Only Gaussian is implemented now.
               Defaults to ``None``.
        :type rand_family: Literal["gaussian"], optional
        :param max_iterations: Maximum number of iterations. Value should be >= 1. A value of 0 is only set when only
               the model coefficient names and model coefficient dimensions are needed.
               Defaults to ``-1``.
        :type max_iterations: int
        :param initial_fixed_effects: An array that contains initial values of the fixed effects coefficient.
               Defaults to ``None``.
        :type initial_fixed_effects: List[float], optional
        :param initial_random_effects: An H2OFrame id that contains initial values of the random effects coefficient.
               The row names should be the random coefficient names. If you are not sure what the random coefficient
               names are, build the HGLM model with max_iterations = 0 and check out the model output field
               random_coefficient_names. The number of rows of this frame should be the number of level 2 units.
               Again, to figure this out, build the HGLM model with max_iterations = 0 and check out the model output
               field group_column_names. The number of rows should equal the length of the group_column_names.
               Defaults to ``None``.
        :type initial_random_effects: Union[None, str, H2OFrame], optional
        :param initial_t_matrix: An H2OFrame id that contains initial values of the T matrix. It should be a positive
               symmetric matrix.
               Defaults to ``None``.
        :type initial_t_matrix: Union[None, str, H2OFrame], optional
        :param tau_u_var_init: Initial variance of random coefficient effects. If set, should provide a value > 0.0.
               If not set, will be randomly set in the model building process.
               Defaults to ``0.0``.
        :type tau_u_var_init: float
        :param tau_e_var_init: Initial variance of random noise. If set, should provide a value > 0.0. If not set,
               will be randomly set in the model building process.
               Defaults to ``0.0``.
        :type tau_e_var_init: float
        :param random_columns: Random columns indices for HGLM.
               Defaults to ``None``.
        :type random_columns: List[str], optional
        :param method: We only implemented EM as a method to obtain the fixed, random coefficients and the various
               variances.
               Defaults to ``"em"``.
        :type method: Literal["em"]
        :param em_epsilon: Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em_epsilon.
               ONLY applies to EM method.
               Defaults to ``0.001``.
        :type em_epsilon: float
        :param random_intercept: If true, will allow a random component to the GLM coefficients.
               Defaults to ``True``.
        :type random_intercept: bool
        :param group_column: Group column is the column that is categorical and used to generate the groups in HGLM.
               Defaults to ``None``.
        :type group_column: str, optional
        :param gen_syn_data: If true, add gaussian noise with variance specified in parms._tau_e_var_init.
               Defaults to ``False``.
        :type gen_syn_data: bool
        """
        super(H2OHGLMEstimator, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.response_column = response_column
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.offset_column = offset_column
        self.weights_column = weights_column
        self.max_runtime_secs = max_runtime_secs
        self.custom_metric_func = custom_metric_func
        self.score_each_iteration = score_each_iteration
        self.score_iteration_interval = score_iteration_interval
        self.seed = seed
        self.missing_values_handling = missing_values_handling
        self.plug_values = plug_values
        self.family = family
        self.rand_family = rand_family
        self.max_iterations = max_iterations
        self.initial_fixed_effects = initial_fixed_effects
        self.initial_random_effects = initial_random_effects
        self.initial_t_matrix = initial_t_matrix
        self.tau_u_var_init = tau_u_var_init
        self.tau_e_var_init = tau_e_var_init
        self.random_columns = random_columns
        self.method = method
        self.em_epsilon = em_epsilon
        self.random_intercept = random_intercept
        self.group_column = group_column
        self.gen_syn_data = gen_syn_data

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def response_column(self):
        """
        Response variable column.

        Type: ``str``.
        """
        return self._parms.get("response_column")

    @response_column.setter
    def response_column(self, response_column):
        assert_is_type(response_column, None, str)
        self._parms["response_column"] = response_column

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def ignore_const_cols(self):
        """
        Ignore constant columns.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("ignore_const_cols")

    @ignore_const_cols.setter
    def ignore_const_cols(self, ignore_const_cols):
        assert_is_type(ignore_const_cols, None, bool)
        self._parms["ignore_const_cols"] = ignore_const_cols

    @property
    def offset_column(self):
        """
        Offset column. This will be added to the combination of columns before applying the link function.

        Type: ``str``.
        """
        return self._parms.get("offset_column")

    @offset_column.setter
    def offset_column(self, offset_column):
        assert_is_type(offset_column, None, str)
        self._parms["offset_column"] = offset_column

    @property
    def weights_column(self):
        """
        Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
        the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
        data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
        well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
        you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
        an accurate prediction, remove all rows with weight == 0.

        Type: ``str``.
        """
        return self._parms.get("weights_column")

    @weights_column.setter
    def weights_column(self, weights_column):
        assert_is_type(weights_column, None, str)
        self._parms["weights_column"] = weights_column

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        assert_is_type(max_runtime_secs, None, numeric)
        self._parms["max_runtime_secs"] = max_runtime_secs

    @property
    def custom_metric_func(self):
        """
        Reference to custom evaluation function, format: `language:keyName=funcName`

        Type: ``str``.
        """
        return self._parms.get("custom_metric_func")

    @custom_metric_func.setter
    def custom_metric_func(self, custom_metric_func):
        assert_is_type(custom_metric_func, None, str)
        self._parms["custom_metric_func"] = custom_metric_func

    @property
    def score_each_iteration(self):
        """
        Whether to score during each iteration of model training.

        Type: ``bool``, defaults to ``False``.
        """
        return self._parms.get("score_each_iteration")

    @score_each_iteration.setter
    def score_each_iteration(self, score_each_iteration):
        assert_is_type(score_each_iteration, None, bool)
        self._parms["score_each_iteration"] = score_each_iteration

    @property
    def score_iteration_interval(self):
        """
        Perform scoring for every score_iteration_interval iterations.

        Type: ``int``, defaults to ``5``.
        """
        return self._parms.get("score_iteration_interval")

    @score_iteration_interval.setter
    def score_iteration_interval(self, score_iteration_interval):
        assert_is_type(score_iteration_interval, None, int)
        self._parms["score_iteration_interval"] = score_iteration_interval

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def missing_values_handling(self):
        """
        Handling of missing values. Either MeanImputation, Skip or PlugValues.

        Type: ``Literal["mean_imputation", "skip", "plug_values"]``, defaults to ``"mean_imputation"``.
        """
        return self._parms.get("missing_values_handling")

    @missing_values_handling.setter
    def missing_values_handling(self, missing_values_handling):
        assert_is_type(missing_values_handling, None, Enum("mean_imputation", "skip", "plug_values"))
        self._parms["missing_values_handling"] = missing_values_handling

    @property
    def plug_values(self):
        """
        Plug Values (a single row frame containing values that will be used to impute missing values of the
        training/validation frame, used in conjunction with missing_values_handling = PlugValues).

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("plug_values")

    @plug_values.setter
    def plug_values(self, plug_values):
        self._parms["plug_values"] = H2OFrame._validate(plug_values, 'plug_values')

    @property
    def family(self):
        """
        Family. Only gaussian is supported now.

        Type: ``Literal["gaussian"]``, defaults to ``"gaussian"``.
        """
        return self._parms.get("family")

    @family.setter
    def family(self, family):
        assert_is_type(family, None, Enum("gaussian"))
        self._parms["family"] = family

    @property
    def rand_family(self):
        """
        Set distribution of random effects. Only Gaussian is implemented now.

        Type: ``Literal["gaussian"]``.
        """
        return self._parms.get("rand_family")

    @rand_family.setter
    def rand_family(self, rand_family):
        assert_is_type(rand_family, None, Enum("gaussian"))
        self._parms["rand_family"] = rand_family

    @property
    def max_iterations(self):
        """
        Maximum number of iterations. Value should be >= 1. A value of 0 is only set when only the model coefficient
        names and model coefficient dimensions are needed.

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("max_iterations")

    @max_iterations.setter
    def max_iterations(self, max_iterations):
        assert_is_type(max_iterations, None, int)
        self._parms["max_iterations"] = max_iterations

    @property
    def initial_fixed_effects(self):
        """
        An array that contains initial values of the fixed effects coefficient.

        Type: ``List[float]``.
        """
        return self._parms.get("initial_fixed_effects")

    @initial_fixed_effects.setter
    def initial_fixed_effects(self, initial_fixed_effects):
        assert_is_type(initial_fixed_effects, None, [numeric])
        self._parms["initial_fixed_effects"] = initial_fixed_effects

    @property
    def initial_random_effects(self):
        """
        An H2OFrame id that contains initial values of the random effects coefficient. The row names should be the
        random coefficient names. If you are not sure what the random coefficient names are, build the HGLM model
        with max_iterations = 0 and check out the model output field random_coefficient_names. The number of rows of
        this frame should be the number of level 2 units. Again, to figure this out, build the HGLM model with
        max_iterations = 0 and check out the model output field group_column_names. The number of rows should equal
        the length of the group_column_names.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("initial_random_effects")

    @initial_random_effects.setter
    def initial_random_effects(self, initial_random_effects):
        self._parms["initial_random_effects"] = H2OFrame._validate(initial_random_effects, 'initial_random_effects')

    @property
    def initial_t_matrix(self):
        """
        An H2OFrame id that contains initial values of the T matrix. It should be a positive symmetric matrix.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("initial_t_matrix")

    @initial_t_matrix.setter
    def initial_t_matrix(self, initial_t_matrix):
        self._parms["initial_t_matrix"] = H2OFrame._validate(initial_t_matrix, 'initial_t_matrix')

    @property
    def tau_u_var_init(self):
        """
        Initial variance of random coefficient effects. If set, should provide a value > 0.0. If not set, will be
        randomly set in the model building process.

        Type: ``float``, defaults to ``0.0``.
        """
        return self._parms.get("tau_u_var_init")

    @tau_u_var_init.setter
    def tau_u_var_init(self, tau_u_var_init):
        assert_is_type(tau_u_var_init, None, numeric)
        self._parms["tau_u_var_init"] = tau_u_var_init

    @property
    def tau_e_var_init(self):
        """
        Initial variance of random noise. If set, should provide a value > 0.0. If not set, will be randomly set in
        the model building process.

        Type: ``float``, defaults to ``0.0``.
        """
        return self._parms.get("tau_e_var_init")

    @tau_e_var_init.setter
    def tau_e_var_init(self, tau_e_var_init):
        assert_is_type(tau_e_var_init, None, numeric)
        self._parms["tau_e_var_init"] = tau_e_var_init

    @property
    def random_columns(self):
        """
        Random columns indices for HGLM.

        Type: ``List[str]``.

        :examples:

        >>> import h2o
        >>> from h2o.estimators import H2OHGLMEstimator
        >>> h2o.init()
        >>> prostate_path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
        >>> prostate = h2o.import_file(path=prostate_path)
        >>> prostate["RACE"] = prostate["RACE"].asfactor()
        >>> hglm_model = H2OHGLMEstimator(random_columns=["AGE"], group_column="RACE")
        >>> hglm_model.train(x=["AGE", "RACE", "DPROS"], y="CAPSULE", training_frame=prostate)
        """
        return self._parms.get("random_columns")

    @random_columns.setter
    def random_columns(self, random_columns):
        assert_is_type(random_columns, None, [str])
        self._parms["random_columns"] = random_columns

    @property
    def method(self):
        """
        We only implemented EM as a method to obtain the fixed, random coefficients and the various variances.

        Type: ``Literal["em"]``, defaults to ``"em"``.
        """
        return self._parms.get("method")

    @method.setter
    def method(self, method):
        assert_is_type(method, None, Enum("em"))
        self._parms["method"] = method

    @property
    def em_epsilon(self):
        """
        Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em_epsilon. ONLY applies to EM
        method.

        Type: ``float``, defaults to ``0.001``.
        """
        return self._parms.get("em_epsilon")

    @em_epsilon.setter
    def em_epsilon(self, em_epsilon):
        assert_is_type(em_epsilon, None, numeric)
        self._parms["em_epsilon"] = em_epsilon

    @property
    def random_intercept(self):
        """
        If true, will allow a random component to the GLM coefficients.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("random_intercept")

    @random_intercept.setter
    def random_intercept(self, random_intercept):
        assert_is_type(random_intercept, None, bool)
        self._parms["random_intercept"] = random_intercept

    @property
    def group_column(self):
        """
        Group column is the column that is categorical and used to generate the groups in HGLM.

        Type: ``str``.
        """
        return self._parms.get("group_column")

    @group_column.setter
    def group_column(self, group_column):
        assert_is_type(group_column, None, str)
        self._parms["group_column"] = group_column

    @property
    def gen_syn_data(self):
        """
        If true, add gaussian noise with variance specified in parms._tau_e_var_init.

        Type: ``bool``, defaults to ``False``.
        """
        return self._parms.get("gen_syn_data")

    @gen_syn_data.setter
    def gen_syn_data(self, gen_syn_data):
        assert_is_type(gen_syn_data, None, bool)
        self._parms["gen_syn_data"] = gen_syn_data
    def level_2_names(self):
        """
        Get the level 2 column values.
        """
        return self._model_json["output"]["group_column_names"]

    def coefs_random_names(self):
        """
        Get the random effect coefficient names including the intercept if applicable.
        """
        return self._model_json["output"]["random_coefficient_names"]

    def coefs_random(self):
        """
        Get the random coefficients of the model.
        """
        level_2_names = self.level_2_names()
        random_coefs = self._model_json["output"]["ubeta"]
        return dict(zip(level_2_names, random_coefs))
    def scoring_history_valid(self, as_data_frame=True):
        """
        Retrieve the model scoring history for the validation data frame, if present.

        :returns: The validation score history as an H2OTwoDimTable or a Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "scoring_history_valid" in model and model["scoring_history_valid"] is not None:
            if as_data_frame:
                return model["scoring_history_valid"].as_data_frame()
            else:
                return model["scoring_history_valid"]
        print("No validation scoring history for this model")

    def matrix_T(self):
        """
        Retrieve the T matrix estimated for the random effects. The T matrix is the Tj matrix described in section
        II.I of the doc.

        :return: The T matrix as a tuple of tuples.
        """
        model = self._model_json["output"]
        return model["tmat"]

    def residual_variance(self):
        """
        Retrieve the residual variance estimate from the model building process.

        :return: residual variance estimate as a double
        """
        model = self._model_json["output"]
        return model["residual_variance"]

    def icc(self):
        """
        Retrieve the ICC from the model building process.

        :return: ICC as an array
        """
        model = self._model_json["output"]
        return model["icc"]

    def mean_residual_fixed(self, train=True):
        """
        Retrieve the mean residual error using the fixed effect coefficients only.

        :param train: boolean, if true return the result from the training frame, else return the result from the
            validation frame.
        :return: mean residual error as a double.
        """
        model = self._model_json["output"]
        if train:
            return model["mean_residual_fixed"]
        else:
            return model["mean_residual_fixed_valid"]
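The accessors defined above (level_2_names, coefs_random, matrix_T, residual_variance, icc, mean_residual_fixed) are only meaningful on a trained model. A minimal end-to-end sketch, assuming a running H2O cluster and using the public prostate dataset from the random_columns example; the choice of predictors and of VOL as the numeric gaussian response is illustrative only:

>>> import h2o
>>> from h2o.estimators import H2OHGLMEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> prostate["RACE"] = prostate["RACE"].asfactor()   # group_column must be an enum column
>>> model = H2OHGLMEstimator(random_columns=["AGE"], group_column="RACE", seed=1234)
>>> model.train(x=["AGE", "DPROS", "PSA"], y="VOL", training_frame=prostate)
>>> model.level_2_names()          # level-2 group values taken from the RACE column
>>> model.coefs_random()           # dict mapping each level-2 group to its random coefficients
>>> model.matrix_T()               # estimated T matrix of the random effects
>>> model.residual_variance()      # estimated residual (noise) variance
>>> model.icc()                    # ICC values from the model output
>>> model.mean_residual_fixed()    # mean residual error using only the fixed effects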