Source code for h2o.model.model_base

"""
This module implements the base model class. All model objects inherit from this class.
"""
from __future__ import print_function
from builtins import zip
from builtins import str
from builtins import range
from builtins import object

import h2o
import imp, traceback, warnings
from ..utils.shared_utils import can_use_pandas
from ..h2o import H2OJob


class ModelBase(object):
    def __init__(self):
        self._id = None
        self._model_json = None
        self._metrics_class = None
        self._is_xvalidated = False
        self._xval_keys = None
        self._parms = {}   # internal, for object recycle
        self.parms = {}    # external
        self._estimator_type = "unsupervised"
        self._future = False  # used by __repr__/show to query job state
        self._job = None      # used when _future is True

    @property
    def model_id(self):
        """
        :return: Retrieve this model's identifier.
        """
        return self._id

    @model_id.setter
    def model_id(self, value):
        oldname = self.model_id
        self._id = value
        h2o.rapids("(rename \"{}\" \"{}\")".format(oldname, value))

    @property
    def params(self):
        """
        Get the parameters and the actual/default values only.

        :return: A dictionary of parameters used to build this model.
        """
        params = {}
        for p in self.parms:
            params[p] = {"default": self.parms[p]["default_value"],
                         "actual": self.parms[p]["actual_value"]}
        return params

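    # Example (illustrative sketch, not part of the original source): assuming
    # `model` is any trained H2O estimator in the current session, `params` can
    # be used to compare a requested value with the one actually used:
    #
    #     >>> p = model.params
    #     >>> p["ntrees"]["default"], p["ntrees"]["actual"]  # key varies by algo
    #     (50, 100)
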
    @property
    def full_parameters(self):
        """
        Get the full specification of all parameters.

        :return: A dictionary of parameters used to build this model.
        """
        return self.parms

    @property
    def type(self):
        """Get the type of model built as a string.

        Returns
        -------
          "classifier" or "regressor" or "unsupervised"
        """
        return self._estimator_type

    def __repr__(self):
        # PUBDEV-2278: using <method>? from IPython caused everything to dump
        stk = traceback.extract_stack()
        if not ("IPython" in stk[-2][0] and "info" == stk[-2][2]):
            self.show()
        return ""

    def predict_leaf_node_assignment(self, test_data):
        """
        Predict on a dataset and return the leaf node assignment (only for tree-based models).

        Parameters
        ----------
          test_data: H2OFrame
            Data on which to make predictions.

        Returns
        -------
          A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame):
            raise ValueError("test_data must be an instance of H2OFrame")
        j = h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id,
                                        leaf_node_assignment=True)
        return h2o.get_frame(j["predictions_frame"]["name"])

    def predict(self, test_data):
        """
        Predict on a dataset.

        Parameters
        ----------
          test_data: H2OFrame
            Data on which to make predictions.

        Returns
        -------
          A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame):
            raise ValueError("test_data must be an instance of H2OFrame")
        j = H2OJob(h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" +
                                               test_data.frame_id, _rest_version=4),
                   self._model_json["algo"] + " prediction")
        j.poll()
        return h2o.get_frame(j.dest_key)

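    # Example (illustrative sketch, not part of the original source): assumes a
    # running H2O cluster, a trained model `model`, and an H2OFrame `test`
    # containing the columns used during training:
    #
    #     >>> preds = model.predict(test)
    #     >>> preds.head()
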
    def is_cross_validated(self):
        """
        :return: True if the model was cross-validated.
        """
        return self._is_xvalidated

    def xval_keys(self):
        """
        :return: The model keys for the cross-validated model.
        """
        return self._xval_keys

    def get_xval_models(self, key=None):
        """
        Return a Model object.

        :param key: If None, return all cross-validated models; otherwise return the model that key points to.
        :return: A model or a list of models.
        """
        return h2o.get_model(key) if key is not None else [h2o.get_model(k) for k in self._xval_keys]

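    # Example (illustrative sketch, not part of the original source): assumes
    # `model` was trained with cross-validation (nfolds > 0), so per-fold
    # models exist in the cluster:
    #
    #     >>> if model.is_cross_validated():
    #     ...     fold_models = model.get_xval_models()  # one model per fold
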
    @property
    def xvals(self):
        """
        Return a list of the cross-validated models.

        :return: A list of models.
        """
        return self.get_xval_models()

    def deepfeatures(self, test_data, layer):
        """
        Return hidden layer details.

        :param test_data: Data to create a feature space on.
        :param layer: 0-based index of the hidden layer to extract.
        """
        if test_data is None:
            raise ValueError("Must specify test data")
        j = H2OJob(h2o.H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data.frame_id,
                                               deep_features_hidden_layer=layer, _rest_version=4),
                   "deepfeatures")
        j.poll()
        return h2o.get_frame(j.dest_key)

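    # Example (illustrative sketch, not part of the original source): assumes
    # `dl_model` is a trained deep learning model and `test` an H2OFrame; this
    # extracts the activations of the first hidden layer as a new frame:
    #
    #     >>> features = dl_model.deepfeatures(test, 0)
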
    def weights(self, matrix_id=0):
        """
        Return the frame for the respective weight matrix.

        :param matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return.
        :return: an H2OFrame which represents the weight matrix identified by matrix_id
        """
        num_weight_matrices = len(self._model_json["output"]["weights"])
        if matrix_id not in list(range(num_weight_matrices)):
            raise ValueError("Weight matrix does not exist. Model has {0} weight matrices (0-based indexing), but "
                             "matrix {1} was requested.".format(num_weight_matrices, matrix_id))
        return h2o.get_frame(self._model_json["output"]["weights"][matrix_id]["URL"].split("/")[3])

    def biases(self, vector_id=0):
        """
        Return the frame for the respective bias vector.

        :param vector_id: an integer, ranging from 0 to number of layers, that specifies the bias vector to return.
        :return: an H2OFrame which represents the bias vector identified by vector_id
        """
        num_bias_vectors = len(self._model_json["output"]["biases"])
        if vector_id not in list(range(num_bias_vectors)):
            raise ValueError("Bias vector does not exist. Model has {0} bias vectors (0-based indexing), but "
                             "vector {1} was requested.".format(num_bias_vectors, vector_id))
        return h2o.get_frame(self._model_json["output"]["biases"][vector_id]["URL"].split("/")[3])

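    # Example (illustrative sketch, not part of the original source): for a
    # deep learning model trained with export_weights_and_biases=True, the
    # learned parameters of the first layer can be pulled back as frames:
    #
    #     >>> w0 = dl_model.weights(0)  # weight matrix into hidden layer 0
    #     >>> b0 = dl_model.biases(0)   # bias vector of hidden layer 0
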
    def normmul(self):
        """Normalization/Standardization multipliers for numeric predictors."""
        return self._model_json["output"]["normmul"]

    def normsub(self):
        """Normalization/Standardization offsets for numeric predictors."""
        return self._model_json["output"]["normsub"]

    def respmul(self):
        """Normalization/Standardization multipliers for numeric response."""
        return self._model_json["output"]["normrespmul"]

    def respsub(self):
        """Normalization/Standardization offsets for numeric response."""
        return self._model_json["output"]["normrespsub"]

    def catoffsets(self):
        """Categorical offsets for one-hot encoding."""
        return self._model_json["output"]["catoffsets"]

    def model_performance(self, test_data=None, train=False, valid=False):
        """
        Generate model metrics for this model on test_data.

        Parameters
        ----------
          test_data: H2OFrame, optional
            Data set on which the model metrics are to be computed. Both train and valid arguments are
            ignored if test_data is not None.
          train: boolean, optional
            Report the training metrics for the model. If test_data is the training data, the training
            metrics are returned.
          valid: boolean, optional
            Report the validation metrics for the model. If both train and valid are True, the training
            metrics are returned.

        Returns
        -------
          An object of class H2OModelMetrics.
        """
        if test_data is None:
            if not train and not valid: train = True  # default to train
            if train: return self._model_json["output"]["training_metrics"]
            if valid: return self._model_json["output"]["validation_metrics"]
        else:  # cases dealing with test_data not None
            if not isinstance(test_data, h2o.H2OFrame):
                raise ValueError("`test_data` must be of type H2OFrame. Got: " + str(type(test_data)))
            res = h2o.H2OConnection.post_json("ModelMetrics/models/" + self.model_id + "/frames/" +
                                              test_data.frame_id)

            # FIXME need to do the client-side filtering...
            # PUBDEV-874: https://0xdata.atlassian.net/browse/PUBDEV-874
            raw_metrics = None
            for mm in res["model_metrics"]:
                if mm["frame"] is not None and mm["frame"]["name"] == test_data.frame_id:
                    raw_metrics = mm
                    break
            return self._metrics_class(raw_metrics, algo=self._model_json["algo"])

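    # Example (illustrative sketch, not part of the original source): metrics
    # can be read from the training run or recomputed on a held-out H2OFrame:
    #
    #     >>> model.model_performance(train=True)   # metrics stored at training time
    #     >>> perf = model.model_performance(test)  # metrics computed on `test`
    #     >>> perf.show()
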
    def scoring_history(self):
        """
        Retrieve the model's scoring history.

        Returns
        -------
          The score history as an H2OTwoDimTable or a Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "scoring_history" in list(model.keys()) and model["scoring_history"] is not None:
            return model["scoring_history"].as_data_frame()
        print("No score history for this model")

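    # Example (illustrative sketch, not part of the original source): with
    # pandas installed the history comes back as a DataFrame and can be sliced
    # directly; the column names below are hypothetical and vary by algorithm:
    #
    #     >>> sh = model.scoring_history()
    #     >>> sh[["number_of_trees", "training_MSE"]]
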
    def score_history(self):
        """Deprecated; use ``scoring_history`` instead."""
        warnings.warn("`score_history` is deprecated. Use `scoring_history`.", category=DeprecationWarning,
                      stacklevel=2)
        return self.scoring_history()

    def summary(self):
        """Print a detailed summary of the model."""
        model = self._model_json["output"]
        if model["model_summary"]:
            model["model_summary"].show()  # H2OTwoDimTable object

    def show(self):
        """Print the innards of the model, without regard to type."""
        if self._future:
            self._job.poll_once()
            return
        if self._model_json is None:
            print("No model trained yet")
            return
        if self.model_id is None:
            print("This H2OEstimator has been removed.")
            return
        model = self._model_json["output"]
        print("Model Details")
        print("=============")

        print(self.__class__.__name__, ": ", self._model_json["algo_full_name"])
        print("Model Key: ", self._id)

        self.summary()

        print()
        # training metrics
        tm = model["training_metrics"]
        if tm: tm.show()
        vm = model["validation_metrics"]
        if vm: vm.show()
        xm = model["cross_validation_metrics"]
        if xm: xm.show()

        if "scoring_history" in list(model.keys()) and model["scoring_history"]:
            model["scoring_history"].show()
        if "variable_importances" in list(model.keys()) and model["variable_importances"]:
            model["variable_importances"].show()

    def varimp(self, use_pandas=False):
        """
        Pretty print the variable importances, or return them in a list.

        Parameters
        ----------
          use_pandas: boolean, optional
            If True, then the variable importances will be returned as a pandas data frame.

        Returns
        -------
          A list or Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "variable_importances" in list(model.keys()) and model["variable_importances"]:
            vals = model["variable_importances"].cell_values
            header = model["variable_importances"].col_header
            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print("Warning: This model doesn't have variable importances")

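    # Example (illustrative sketch, not part of the original source): the raw
    # list holds one row per variable (typically name, relative importance,
    # scaled importance, percentage); with use_pandas=True the same table comes
    # back as a DataFrame:
    #
    #     >>> vi = model.varimp(use_pandas=True)
    #     >>> vi.head()
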
    def residual_deviance(self, train=False, valid=False, xval=False):
        """
        Retrieve the residual deviance if this model has the attribute, or None otherwise.

        :param train: Get the residual deviance for the training set. If both train and valid are False,
          then train is selected by default.
        :param valid: Get the residual deviance for the validation set. If both train and valid are True,
          then train is selected by default.
        :return: Return the residual deviance, or None if it is not present.
        """
        if xval: raise ValueError("Cross-validation metrics are not available.")
        if not train and not valid: train = True
        if train and valid: train = True
        return self._model_json["output"]["training_metrics"].residual_deviance() if train \
            else self._model_json["output"]["validation_metrics"].residual_deviance()

    def residual_degrees_of_freedom(self, train=False, valid=False, xval=False):
        """
        Retrieve the residual degrees of freedom if this model has the attribute, or None otherwise.

        :param train: Get the residual dof for the training set. If both train and valid are False,
          then train is selected by default.
        :param valid: Get the residual dof for the validation set. If both train and valid are True,
          then train is selected by default.
        :return: Return the residual dof, or None if it is not present.
        """
        if xval: raise ValueError("Cross-validation metrics are not available.")
        if not train and not valid: train = True
        if train and valid: train = True
        return self._model_json["output"]["training_metrics"].residual_degrees_of_freedom() if train \
            else self._model_json["output"]["validation_metrics"].residual_degrees_of_freedom()

    def null_deviance(self, train=False, valid=False, xval=False):
        """
        Retrieve the null deviance if this model has the attribute, or None otherwise.

        :param train: Get the null deviance for the training set. If both train and valid are False,
          then train is selected by default.
        :param valid: Get the null deviance for the validation set. If both train and valid are True,
          then train is selected by default.
        :return: Return the null deviance, or None if it is not present.
        """
        if xval: raise ValueError("Cross-validation metrics are not available.")
        if not train and not valid: train = True
        if train and valid: train = True
        return self._model_json["output"]["training_metrics"].null_deviance() if train \
            else self._model_json["output"]["validation_metrics"].null_deviance()

    def null_degrees_of_freedom(self, train=False, valid=False, xval=False):
        """
        Retrieve the null degrees of freedom if this model has the attribute, or None otherwise.

        :param train: Get the null dof for the training set. If both train and valid are False,
          then train is selected by default.
        :param valid: Get the null dof for the validation set. If both train and valid are True,
          then train is selected by default.
        :return: Return the null dof, or None if it is not present.
        """
        if xval: raise ValueError("Cross-validation metrics are not available.")
        if not train and not valid: train = True
        if train and valid: train = True
        return self._model_json["output"]["training_metrics"].null_degrees_of_freedom() if train \
            else self._model_json["output"]["validation_metrics"].null_degrees_of_freedom()

    def pprint_coef(self):
        """Pretty print the coefficients table (includes normalized coefficients)."""
        print(self._model_json["output"]["coefficients_table"])  # will return None if no coefs!

    def coef(self):
        """
        :return: Return the coefficients for this model.
        """
        tbl = self._model_json["output"]["coefficients_table"]
        if tbl is None: return None
        tbl = tbl.cell_values
        return {a[0]: a[1] for a in tbl}

    def coef_norm(self):
        """
        :return: Return the normalized coefficients.
        """
        tbl = self._model_json["output"]["coefficients_table"]
        if tbl is None: return None
        tbl = tbl.cell_values
        return {a[0]: a[2] for a in tbl}

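    # Example (illustrative sketch, not part of the original source): for a GLM,
    # both dictionaries map coefficient names to values and include the
    # intercept:
    #
    #     >>> glm_model.coef()["Intercept"]       # coefficients on the original scale
    #     >>> glm_model.coef_norm()["Intercept"]  # coefficients on the standardized scale
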
    def r2(self, train=False, valid=False, xval=False):
        """
        Return the R^2 for this regression model.

        The R^2 value is defined to be 1 - MSE/var, where var is computed as sigma*sigma.

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        :param train: If train is True, then return the R^2 value for the training data.
        :param valid: If valid is True, then return the R^2 value for the validation data.
        :param xval: If xval is True, then return the R^2 value for the cross validation data.
        :return: The R^2 for this regression model.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.r2()
        return list(m.values())[0] if len(m) == 1 else m

    def mse(self, train=False, valid=False, xval=False):
        """
        Get the MSE(s).

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        Parameters
        ----------
          train : bool, default=False
            If train is True, then return the MSE value for the training data.
          valid : bool, default=False
            If valid is True, then return the MSE value for the validation data.
          xval : bool, default=False
            If xval is True, then return the MSE value for the cross validation data.

        Returns
        -------
          The MSE for this regression model.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.mse()
        return list(m.values())[0] if len(m) == 1 else m

    def logloss(self, train=False, valid=False, xval=False):
        """
        Get the Log Loss value(s).

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        :param train: If train is True, then return the Log Loss value for the training data.
        :param valid: If valid is True, then return the Log Loss value for the validation data.
        :param xval: If xval is True, then return the Log Loss value for the cross validation data.
        :return: The Log Loss for this binomial model.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.logloss()
        return list(m.values())[0] if len(m) == 1 else m

    def mean_residual_deviance(self, train=False, valid=False, xval=False):
        """
        Get the Mean Residual Deviance(s).

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        :param train: If train is True, then return the Mean Residual Deviance value for the training data.
        :param valid: If valid is True, then return the Mean Residual Deviance value for the validation data.
        :param xval: If xval is True, then return the Mean Residual Deviance value for the cross validation data.
        :return: The Mean Residual Deviance for this regression model.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.mean_residual_deviance()
        return list(m.values())[0] if len(m) == 1 else m

    def auc(self, train=False, valid=False, xval=False):
        """
        Get the AUC(s).

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        :param train: If train is True, then return the AUC value for the training data.
        :param valid: If valid is True, then return the AUC value for the validation data.
        :param xval: If xval is True, then return the AUC value for the cross validation data.
        :return: The AUC.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.auc()
        return list(m.values())[0] if len(m) == 1 else m

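    # Example (illustrative sketch, not part of the original source): all of the
    # metric accessors above share the same calling convention, so one call can
    # fetch a single value or a dictionary keyed by "train"/"valid"/"xval":
    #
    #     >>> model.auc()                       # training AUC (the default)
    #     >>> model.auc(train=True, xval=True)  # {"train": ..., "xval": ...}
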
    def aic(self, train=False, valid=False, xval=False):
        """
        Get the AIC(s).

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        :param train: If train is True, then return the AIC value for the training data.
        :param valid: If valid is True, then return the AIC value for the validation data.
        :param xval: If xval is True, then return the AIC value for the cross validation data.
        :return: The AIC.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.aic()
        return list(m.values())[0] if len(m) == 1 else m

    def giniCoef(self, train=False, valid=False, xval=False):
        """
        Get the Gini Coefficient(s).

        If all are False (default), then return the training metric value.
        If more than one option is set to True, then return a dictionary of metrics where the keys are
        "train", "valid", and "xval".

        :param train: If train is True, then return the Gini Coefficient value for the training data.
        :param valid: If valid is True, then return the Gini Coefficient value for the validation data.
        :param xval: If xval is True, then return the Gini Coefficient value for the cross validation data.
        :return: The Gini Coefficient for this binomial model.
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in zip(list(tm.keys()), list(tm.values())): m[k] = None if v is None else v.giniCoef()
        return list(m.values())[0] if len(m) == 1 else m

    def download_pojo(self, path=""):
        """
        Download the POJO for this model to the directory specified by path (no trailing slash!).
        If path is "", then dump to screen.

        :param path: An absolute path to the directory where the POJO should be saved.
        :return: None
        """
        h2o.download_pojo(self, path)  # call the "package" function

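    # Example (illustrative sketch, not part of the original source): an empty
    # path prints the generated Java source, while an absolute path writes a
    # .java file into that directory:
    #
    #     >>> model.download_pojo()                  # dump POJO source to stdout
    #     >>> model.download_pojo(path="/tmp/pojo")  # hypothetical directory
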
    @staticmethod
    def _get_metrics(o, train, valid, xval):
        metrics = {}
        if train: metrics["train"] = o._model_json["output"]["training_metrics"]
        if valid: metrics["valid"] = o._model_json["output"]["validation_metrics"]
        if xval: metrics["xval"] = o._model_json["output"]["cross_validation_metrics"]
        if len(metrics) == 0:
            metrics["train"] = o._model_json["output"]["training_metrics"]
        return metrics

    # Delete from cluster as model goes out of scope
    # def __del__(self):
    #     h2o.remove(self._id)

    def _plot(self, timestep, metric, **kwargs):
        # check for matplotlib. exit if absent
        try:
            imp.find_module('matplotlib')
            import matplotlib
            if 'server' in list(kwargs.keys()) and kwargs['server']:
                matplotlib.use('Agg', warn=False)
            import matplotlib.pyplot as plt
        except ImportError:
            print("matplotlib is required for this function!")
            return

        scoring_history = self.scoring_history()
        # Separate functionality for GLM since its output is different from other algos
        if self._model_json["algo"] == "glm":
            # GLM has only one timestep option, which is `iteration`
            timestep = "iteration"
            if metric == "AUTO":
                metric = "log_likelihood"
            elif metric not in ("log_likelihood", "objective"):
                raise ValueError("for GLM, metric must be one of: log_likelihood, objective")
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Validation Scoring History")
            plt.plot(scoring_history[timestep], scoring_history[metric])

        elif self._model_json["algo"] in ("deeplearning", "drf", "gbm"):
            # Set timestep
            if self._model_json["algo"] in ("gbm", "drf"):
                if timestep == "AUTO":
                    timestep = "number_of_trees"
                elif timestep not in ("duration", "number_of_trees"):
                    raise ValueError("timestep for gbm or drf must be one of: duration, number_of_trees")
            else:  # self._model_json["algo"] == "deeplearning"
                # Delete first row of DL scoring history since it contains NAs & NaNs
                if scoring_history["samples"][0] == 0:
                    scoring_history = scoring_history[1:]
                if timestep == "AUTO":
                    timestep = "epochs"
                elif timestep not in ("epochs", "samples", "duration"):
                    raise ValueError("timestep for deeplearning must be one of: epochs, samples, duration")

            training_metric = "training_{}".format(metric)
            validation_metric = "validation_{}".format(metric)
            if timestep == "duration":
                dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
                scoring_history[dur_colname] = [str(x).split()[0] for x in scoring_history["duration"]]
                timestep = dur_colname

            if can_use_pandas():
                valid = validation_metric in list(scoring_history)
                ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                        scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                    else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
            else:
                valid = validation_metric in scoring_history.col_header
                ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                        max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                    else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
            if ylim[0] == ylim[1]: ylim = (0, 1)

            if valid:  # Training and validation scoring history
                plt.xlabel(timestep)
                plt.ylabel(metric)
                plt.title("Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
                plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange",
                         label="Validation")
                plt.legend()
            else:  # Training scoring history only
                plt.xlabel(timestep)
                plt.ylabel(training_metric)
                plt.title("Training Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep], scoring_history[training_metric])

        else:  # algo is not glm, deeplearning, drf, gbm
            raise ValueError("Plotting not implemented for this type of model")
        if "server" not in list(kwargs.keys()) or not kwargs["server"]:
            plt.show()

    @staticmethod
    def _check_targets(y_actual, y_predicted):
        """Check that y_actual and y_predicted have the same length.

        :param y_actual: An H2OFrame
        :param y_predicted: An H2OFrame
        :return: None
        """
        if len(y_actual) != len(y_predicted):
            raise ValueError("Row mismatch: [{},{}]".format(len(y_actual), len(y_predicted)))