Source code for h2o.estimators.hglm

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OHGLMEstimator(H2OEstimator):
    """
    Hierarchical Generalized Linear Model

    Fits a HGLM model with both the residual noise and random effect being modeled by Gaussian distribution. The fixed
    effect coefficients are specified in parameter x, the random effect coefficients are specified in parameter
    random_columns. The column specified in group_column will contain the level 2 index value and must be an enum
    column.
    """

    algo = "hglm"
    supervised_learning = True

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 response_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 offset_column=None,  # type: Optional[str]
                 weights_column=None,  # type: Optional[str]
                 max_runtime_secs=0.0,  # type: float
                 custom_metric_func=None,  # type: Optional[str]
                 score_each_iteration=False,  # type: bool
                 score_iteration_interval=5,  # type: int
                 seed=-1,  # type: int
                 missing_values_handling="mean_imputation",  # type: Literal["mean_imputation", "skip", "plug_values"]
                 plug_values=None,  # type: Optional[Union[None, str, H2OFrame]]
                 family="gaussian",  # type: Literal["gaussian"]
                 rand_family=None,  # type: Optional[Literal["gaussian"]]
                 max_iterations=-1,  # type: int
                 initial_fixed_effects=None,  # type: Optional[List[float]]
                 initial_random_effects=None,  # type: Optional[Union[None, str, H2OFrame]]
                 initial_t_matrix=None,  # type: Optional[Union[None, str, H2OFrame]]
                 tau_u_var_init=0.0,  # type: float
                 tau_e_var_init=0.0,  # type: float
                 random_columns=None,  # type: Optional[List[str]]
                 method="em",  # type: Literal["em"]
                 em_epsilon=0.001,  # type: float
                 random_intercept=True,  # type: bool
                 group_column=None,  # type: Optional[str]
                 gen_syn_data=False,  # type: bool
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param response_column: Response variable column.
               Defaults to ``None``.
        :type response_column: str, optional
        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns.
               Defaults to ``True``.
        :type ignore_const_cols: bool
        :param offset_column: Offset column. This will be added to the combination of columns before applying the
               link function.
               Defaults to ``None``.
        :type offset_column: str, optional
        :param weights_column: Column with observation weights. Giving some observation a weight of zero is
               equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is
               equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row
               observation weights and do not increase the size of the data frame. This is typically the number of
               times a row is repeated, but non-integer values are supported as well. During training, rows with
               higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a
               row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate
               prediction, remove all rows with weight == 0.
               Defaults to ``None``.
        :type weights_column: str, optional
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName`
               Defaults to ``None``.
        :type custom_metric_func: str, optional
        :param score_each_iteration: Whether to score during each iteration of model training.
               Defaults to ``False``.
        :type score_each_iteration: bool
        :param score_iteration_interval: Perform scoring for every score_iteration_interval iterations.
               Defaults to ``5``.
        :type score_iteration_interval: int
        :param seed: Seed for pseudo random number generator (if applicable).
               Defaults to ``-1``.
        :type seed: int
        :param missing_values_handling: Handling of missing values. Either MeanImputation, Skip or PlugValues.
               Defaults to ``"mean_imputation"``.
        :type missing_values_handling: Literal["mean_imputation", "skip", "plug_values"]
        :param plug_values: Plug Values (a single row frame containing values that will be used to impute missing
               values of the training/validation frame, used in conjunction with missing_values_handling = PlugValues).
               Defaults to ``None``.
        :type plug_values: Union[None, str, H2OFrame], optional
        :param family: Family. Only gaussian is supported now.
               Defaults to ``"gaussian"``.
        :type family: Literal["gaussian"]
        :param rand_family: Set distribution of random effects. Only Gaussian is implemented now.
               Defaults to ``None``.
        :type rand_family: Literal["gaussian"], optional
        :param max_iterations: Maximum number of iterations. Value should be >= 1. A value of 0 is only set when only
               the model coefficient names and model coefficient dimensions are needed.
               Defaults to ``-1``.
        :type max_iterations: int
        :param initial_fixed_effects: An array that contains initial values of the fixed effects coefficient.
               Defaults to ``None``.
        :type initial_fixed_effects: List[float], optional
        :param initial_random_effects: An H2OFrame id that contains initial values of the random effects coefficient.
               The row names should be the random coefficient names. If you are not sure what the random coefficient
               names are, build the HGLM model with max_iterations = 0 and check out the model output field
               random_coefficient_names. The number of rows of this frame should be the number of level 2 units.
               Again, to figure this out, build the HGLM model with max_iterations = 0 and check out the model output
               field group_column_names. The number of rows should equal the length of the group_column_names.
               Defaults to ``None``.
        :type initial_random_effects: Union[None, str, H2OFrame], optional
        :param initial_t_matrix: An H2OFrame id that contains initial values of the T matrix. It should be a positive
               symmetric matrix.
               Defaults to ``None``.
        :type initial_t_matrix: Union[None, str, H2OFrame], optional
        :param tau_u_var_init: Initial variance of random coefficient effects. If set, should provide a value > 0.0.
               If not set, will be randomly set in the model building process.
               Defaults to ``0.0``.
        :type tau_u_var_init: float
        :param tau_e_var_init: Initial variance of random noise. If set, should provide a value > 0.0. If not set,
               will be randomly set in the model building process.
               Defaults to ``0.0``.
        :type tau_e_var_init: float
        :param random_columns: Random columns indices for HGLM.
               Defaults to ``None``.
        :type random_columns: List[str], optional
        :param method: We only implemented EM as a method to obtain the fixed, random coefficients and the various
               variances.
               Defaults to ``"em"``.
        :type method: Literal["em"]
        :param em_epsilon: Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em_epsilon.
               ONLY applies to EM method.
               Defaults to ``0.001``.
        :type em_epsilon: float
        :param random_intercept: If true, will allow a random component to the GLM coefficients.
               Defaults to ``True``.
        :type random_intercept: bool
        :param group_column: Group column is the column that is categorical and used to generate the groups in HGLM.
               Defaults to ``None``.
        :type group_column: str, optional
        :param gen_syn_data: If true, add gaussian noise with variance specified in parms._tau_e_var_init.
               Defaults to ``False``.
        :type gen_syn_data: bool
        """
        super(H2OHGLMEstimator, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.response_column = response_column
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.offset_column = offset_column
        self.weights_column = weights_column
        self.max_runtime_secs = max_runtime_secs
        self.custom_metric_func = custom_metric_func
        self.score_each_iteration = score_each_iteration
        self.score_iteration_interval = score_iteration_interval
        self.seed = seed
        self.missing_values_handling = missing_values_handling
        self.plug_values = plug_values
        self.family = family
        self.rand_family = rand_family
        self.max_iterations = max_iterations
        self.initial_fixed_effects = initial_fixed_effects
        self.initial_random_effects = initial_random_effects
        self.initial_t_matrix = initial_t_matrix
        self.tau_u_var_init = tau_u_var_init
        self.tau_e_var_init = tau_e_var_init
        self.random_columns = random_columns
        self.method = method
        self.em_epsilon = em_epsilon
        self.random_intercept = random_intercept
        self.group_column = group_column
        self.gen_syn_data = gen_syn_data

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def response_column(self):
        """
        Response variable column.

        Type: ``str``.
        """
        return self._parms.get("response_column")

    @response_column.setter
    def response_column(self, response_column):
        assert_is_type(response_column, None, str)
        self._parms["response_column"] = response_column

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def ignore_const_cols(self):
        """
        Ignore constant columns.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("ignore_const_cols")

    @ignore_const_cols.setter
    def ignore_const_cols(self, ignore_const_cols):
        assert_is_type(ignore_const_cols, None, bool)
        self._parms["ignore_const_cols"] = ignore_const_cols

    @property
    def offset_column(self):
        """
        Offset column. This will be added to the combination of columns before applying the link function.

        Type: ``str``.
        """
        return self._parms.get("offset_column")

    @offset_column.setter
    def offset_column(self, offset_column):
        assert_is_type(offset_column, None, str)
        self._parms["offset_column"] = offset_column

    @property
    def weights_column(self):
        """
        Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
        the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
        data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
        well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
        you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
        an accurate prediction, remove all rows with weight == 0.

        Type: ``str``.
        """
        return self._parms.get("weights_column")

    @weights_column.setter
    def weights_column(self, weights_column):
        assert_is_type(weights_column, None, str)
        self._parms["weights_column"] = weights_column

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        assert_is_type(max_runtime_secs, None, numeric)
        self._parms["max_runtime_secs"] = max_runtime_secs

    @property
    def custom_metric_func(self):
        """
        Reference to custom evaluation function, format: `language:keyName=funcName`

        Type: ``str``.
        """
        return self._parms.get("custom_metric_func")

    @custom_metric_func.setter
    def custom_metric_func(self, custom_metric_func):
        assert_is_type(custom_metric_func, None, str)
        self._parms["custom_metric_func"] = custom_metric_func

    @property
    def score_each_iteration(self):
        """
        Whether to score during each iteration of model training.

        Type: ``bool``, defaults to ``False``.
        """
        return self._parms.get("score_each_iteration")

    @score_each_iteration.setter
    def score_each_iteration(self, score_each_iteration):
        assert_is_type(score_each_iteration, None, bool)
        self._parms["score_each_iteration"] = score_each_iteration

    @property
    def score_iteration_interval(self):
        """
        Perform scoring for every score_iteration_interval iterations.

        Type: ``int``, defaults to ``5``.
        """
        return self._parms.get("score_iteration_interval")

    @score_iteration_interval.setter
    def score_iteration_interval(self, score_iteration_interval):
        assert_is_type(score_iteration_interval, None, int)
        self._parms["score_iteration_interval"] = score_iteration_interval

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def missing_values_handling(self):
        """
        Handling of missing values. Either MeanImputation, Skip or PlugValues.

        Type: ``Literal["mean_imputation", "skip", "plug_values"]``, defaults to ``"mean_imputation"``.
        """
        return self._parms.get("missing_values_handling")

    @missing_values_handling.setter
    def missing_values_handling(self, missing_values_handling):
        assert_is_type(missing_values_handling, None, Enum("mean_imputation", "skip", "plug_values"))
        self._parms["missing_values_handling"] = missing_values_handling

    @property
    def plug_values(self):
        """
        Plug Values (a single row frame containing values that will be used to impute missing values of the
        training/validation frame, used in conjunction with missing_values_handling = PlugValues).

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("plug_values")

    @plug_values.setter
    def plug_values(self, plug_values):
        self._parms["plug_values"] = H2OFrame._validate(plug_values, 'plug_values')

    @property
    def family(self):
        """
        Family. Only gaussian is supported now.

        Type: ``Literal["gaussian"]``, defaults to ``"gaussian"``.
        """
        return self._parms.get("family")

    @family.setter
    def family(self, family):
        assert_is_type(family, None, Enum("gaussian"))
        self._parms["family"] = family

    @property
    def rand_family(self):
        """
        Set distribution of random effects. Only Gaussian is implemented now.

        Type: ``Literal["gaussian"]``.
        """
        return self._parms.get("rand_family")

    @rand_family.setter
    def rand_family(self, rand_family):
        assert_is_type(rand_family, None, Enum("gaussian"))
        self._parms["rand_family"] = rand_family

    @property
    def max_iterations(self):
        """
        Maximum number of iterations. Value should be >= 1. A value of 0 is only set when only the model coefficient
        names and model coefficient dimensions are needed.

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("max_iterations")

    @max_iterations.setter
    def max_iterations(self, max_iterations):
        assert_is_type(max_iterations, None, int)
        self._parms["max_iterations"] = max_iterations

    @property
    def initial_fixed_effects(self):
        """
        An array that contains initial values of the fixed effects coefficient.

        Type: ``List[float]``.
        """
        return self._parms.get("initial_fixed_effects")

    @initial_fixed_effects.setter
    def initial_fixed_effects(self, initial_fixed_effects):
        assert_is_type(initial_fixed_effects, None, [numeric])
        self._parms["initial_fixed_effects"] = initial_fixed_effects

    @property
    def initial_random_effects(self):
        """
        An H2OFrame id that contains initial values of the random effects coefficient. The row names should be the
        random coefficient names. If you are not sure what the random coefficient names are, build the HGLM model
        with max_iterations = 0 and check out the model output field random_coefficient_names. The number of rows of
        this frame should be the number of level 2 units. Again, to figure this out, build the HGLM model with
        max_iterations = 0 and check out the model output field group_column_names. The number of rows should equal
        the length of the group_column_names.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("initial_random_effects")

    @initial_random_effects.setter
    def initial_random_effects(self, initial_random_effects):
        self._parms["initial_random_effects"] = H2OFrame._validate(initial_random_effects, 'initial_random_effects')

    @property
    def initial_t_matrix(self):
        """
        An H2OFrame id that contains initial values of the T matrix. It should be a positive symmetric matrix.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("initial_t_matrix")

    @initial_t_matrix.setter
    def initial_t_matrix(self, initial_t_matrix):
        self._parms["initial_t_matrix"] = H2OFrame._validate(initial_t_matrix, 'initial_t_matrix')

    @property
    def tau_u_var_init(self):
        """
        Initial variance of random coefficient effects. If set, should provide a value > 0.0. If not set, will be
        randomly set in the model building process.

        Type: ``float``, defaults to ``0.0``.
        """
        return self._parms.get("tau_u_var_init")

    @tau_u_var_init.setter
    def tau_u_var_init(self, tau_u_var_init):
        assert_is_type(tau_u_var_init, None, numeric)
        self._parms["tau_u_var_init"] = tau_u_var_init

    @property
    def tau_e_var_init(self):
        """
        Initial variance of random noise. If set, should provide a value > 0.0. If not set, will be randomly set in
        the model building process.

        Type: ``float``, defaults to ``0.0``.
        """
        return self._parms.get("tau_e_var_init")

    @tau_e_var_init.setter
    def tau_e_var_init(self, tau_e_var_init):
        assert_is_type(tau_e_var_init, None, numeric)
        self._parms["tau_e_var_init"] = tau_e_var_init

    @property
    def random_columns(self):
        """
        Random columns indices for HGLM.

        Type: ``List[str]``.

        :examples:

        >>> import h2o
        >>> from h2o.estimators import H2OHGLMEstimator
        >>> h2o.init()
        >>> prostate_path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
        >>> prostate = h2o.import_file(path=prostate_path)
        >>> prostate["RACE"] = prostate["RACE"].asfactor()
        >>> hglm_model = H2OHGLMEstimator(random_columns=["AGE"], group_column="RACE")
        >>> hglm_model.train(x=["AGE", "RACE", "DPROS"], y="CAPSULE", training_frame=prostate)
        """
        return self._parms.get("random_columns")

    @random_columns.setter
    def random_columns(self, random_columns):
        assert_is_type(random_columns, None, [str])
        self._parms["random_columns"] = random_columns

    @property
    def method(self):
        """
        We only implemented EM as a method to obtain the fixed, random coefficients and the various variances.

        Type: ``Literal["em"]``, defaults to ``"em"``.
        """
        return self._parms.get("method")

    @method.setter
    def method(self, method):
        assert_is_type(method, None, Enum("em"))
        self._parms["method"] = method

    @property
    def em_epsilon(self):
        """
        Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em_epsilon. ONLY applies to EM
        method.

        Type: ``float``, defaults to ``0.001``.
        """
        return self._parms.get("em_epsilon")

    @em_epsilon.setter
    def em_epsilon(self, em_epsilon):
        assert_is_type(em_epsilon, None, numeric)
        self._parms["em_epsilon"] = em_epsilon

    @property
    def random_intercept(self):
        """
        If true, will allow a random component to the GLM coefficients.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("random_intercept")

    @random_intercept.setter
    def random_intercept(self, random_intercept):
        assert_is_type(random_intercept, None, bool)
        self._parms["random_intercept"] = random_intercept

    @property
    def group_column(self):
        """
        Group column is the column that is categorical and used to generate the groups in HGLM.

        Type: ``str``.
        """
        return self._parms.get("group_column")

    @group_column.setter
    def group_column(self, group_column):
        assert_is_type(group_column, None, str)
        self._parms["group_column"] = group_column

    @property
    def gen_syn_data(self):
        """
        If true, add gaussian noise with variance specified in parms._tau_e_var_init.

        Type: ``bool``, defaults to ``False``.
        """
        return self._parms.get("gen_syn_data")

    @gen_syn_data.setter
    def gen_syn_data(self, gen_syn_data):
        assert_is_type(gen_syn_data, None, bool)
        self._parms["gen_syn_data"] = gen_syn_data
    def level_2_names(self):
        """
        Get the level 2 column values.
        """
        return self._model_json["output"]["group_column_names"]

    def coefs_random_names(self):
        """
        Get the random effect coefficient names including the intercept if applicable.
        """
        return self._model_json["output"]["random_coefficient_names"]

    def coefs_random(self):
        """
        Get the random coefficients of the model.
        """
        level_2_names = self.level_2_names()
        random_coefs = self._model_json["output"]["ubeta"]
        return dict(zip(level_2_names, random_coefs))
    def scoring_history_valid(self, as_data_frame=True):
        """
        Retrieve the model scoring history for the validation data frame, if present.

        :returns: The validation score history as an H2OTwoDimTable or a Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "scoring_history_valid" in model and model["scoring_history_valid"] is not None:
            if as_data_frame:
                return model["scoring_history_valid"].as_data_frame()
            else:
                return model["scoring_history_valid"]
        print("No validation scoring history for this model")

    def matrix_T(self):
        """
        Retrieve the T matrix estimated for the random effects. The T matrix is the Tj matrix described in section
        II.I of the doc.

        :return: The T matrix as a tuple of tuples.
        """
        model = self._model_json["output"]
        return model["tmat"]

    def residual_variance(self):
        """
        Retrieve the residual variance estimate from the model building process.

        :return: residual variance estimate as a double
        """
        model = self._model_json["output"]
        return model["residual_variance"]

    def icc(self):
        """
        Retrieve the ICC from the model building process.

        :return: ICC as an array
        """
        model = self._model_json["output"]
        return model["icc"]

    def mean_residual_fixed(self, train=True):
        """
        Retrieve the mean residual error using the fixed effect coefficients only.

        :param train: boolean, if true return the result from the training frame, else return the result from the
            validation frame.
        :return: mean residual error as a double.
        """
        model = self._model_json["output"]
        if train:
            return model["mean_residual_fixed"]
        else:
            return model["mean_residual_fixed_valid"]
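The accessors defined above (level_2_names, coefs_random, matrix_T, residual_variance, icc, mean_residual_fixed) are only meaningful on a trained model. A minimal end-to-end sketch, assuming a running H2O cluster and using the public prostate dataset from the random_columns example; the choice of predictors and of VOL as the numeric gaussian response is illustrative only:

>>> import h2o
>>> from h2o.estimators import H2OHGLMEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> prostate["RACE"] = prostate["RACE"].asfactor()   # group_column must be an enum column
>>> model = H2OHGLMEstimator(random_columns=["AGE"], group_column="RACE", seed=1234)
>>> model.train(x=["AGE", "DPROS", "PSA"], y="VOL", training_frame=prostate)
>>> model.level_2_names()          # level-2 group values taken from the RACE column
>>> model.coefs_random()           # dict mapping each level-2 group to its random coefficients
>>> model.matrix_T()               # estimated T matrix of the random effects
>>> model.residual_variance()      # estimated residual (noise) variance
>>> model.icc()                    # ICC values from the model output
>>> model.mean_residual_fixed()    # mean residual error using only the fixed effects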