Source code for h2o.model.model_base

"""
This module implements the base model class.  All model things inherit from this class.
"""

import h2o
from . import H2OFrame
from . import H2OVec
from . import H2OTwoDimTable
from . import H2OConnection


[docs]class ModelBase(object): def __init__(self, dest_key, model_json, metrics_class): self._key = dest_key # setup training metrics if "training_metrics" in model_json["output"]: tm = model_json["output"]["training_metrics"] tm = metrics_class(tm,True,False,model_json["algo"]) model_json["output"]["training_metrics"] = tm # setup validation metrics if "validation_metrics" in model_json["output"]: vm = model_json["output"]["validation_metrics"] if vm is None: model_json["output"]["validation_metrics"] = None else: vm = metrics_class(vm,False,True,model_json["algo"]) model_json["output"]["validation_metrics"] = vm else: model_json["output"]["validation_metrics"] = None self._model_json = model_json self._metrics_class = metrics_class def __repr__(self): self.show() return ""
[docs] def predict(self, test_data): """ Predict on a dataset. :param test_data: Data to be predicted on. :return: A new H2OFrame filled with predictions. """ if not test_data: raise ValueError("Must specify test data") # cbind the test_data vecs together and produce a temp key test_data_key = H2OFrame.send_frame(test_data) # get the predictions # this job call is blocking j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key) # toast the cbound frame h2o.removeFrameShallow(test_data_key) # retrieve the prediction frame prediction_frame_key = j["model_metrics"][0]["predictions"]["frame_id"]["name"] # get the actual frame meta dta pred_frame_meta = h2o.frame(prediction_frame_key)["frames"][0] # toast the prediction frame h2o.removeFrameShallow(prediction_frame_key) # collect the vec_ids vec_ids = pred_frame_meta["vec_ids"] # get the number of rows rows = pred_frame_meta["rows"] # get the column names cols = [col["label"] for col in pred_frame_meta["columns"]] # create a set of H2OVec objects vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows) # return a new H2OFrame object return H2OFrame(vecs=vecs)
[docs] def deepfeatures(self, test_data, layer): """ Return hidden layer details :param test_data: Data to create a feature space on :param layer: 0 index hidden layer """ if not test_data: raise ValueError("Must specify test data") # create test_data by cbinding vecs test_data_key = H2OFrame.send_frame(test_data) # get the deepfeatures of the dataset j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key, deep_features_hidden_layer=layer) # retreive the frame data deepfeatures_frame_key = j["predictions_frame"]["name"] df_frame_meta = h2o.frame(deepfeatures_frame_key)["frames"][0] # create vecs by extracting vec_ids, col length, and col names vec_ids = df_frame_meta["vec_ids"] rows = df_frame_meta["rows"] cols = [col["label"] for col in df_frame_meta["columns"]] vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows) # remove test data from kv h2o.removeFrameShallow(test_data_key) # finally return frame return H2OFrame(vecs=vecs)
[docs] def weights(self, matrix_id=0): """ Return the frame for the respective weight matrix :param: matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return. :return: an H2OFrame which represents the weight matrix identified by matrix_id """ num_weight_matrices = len(self._model_json['output']['weights']) if matrix_id not in range(num_weight_matrices): raise ValueError("Weight matrix does not exist. Model has {0} weight matrices (0-based indexing), but matrix {1} " "was requested.".format(num_weight_matrices, matrix_id)) j = h2o.frame(self._model_json['output']['weights'][matrix_id]['URL'].split('/')[3]) fr = j['frames'][0] rows = fr['rows'] vec_ids = fr['vec_ids'] cols = fr['columns'] colnames = [col['label'] for col in cols] result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, vec_ids), rows)) return result
[docs] def biases(self, vector_id=0): """ Return the frame for the respective bias vector :param: vector_id: an integer, ranging from 0 to number of layers, that specifies the bias vector to return. :return: an H2OFrame which represents the bias vector identified by vector_id """ num_bias_vectors = len(self._model_json['output']['biases']) if vector_id not in range(num_bias_vectors): raise ValueError("Bias vector does not exist. Model has {0} bias vectors (0-based indexing), but vector {1} " "was requested.".format(num_bias_vectors, vector_id)) j = h2o.frame(self._model_json['output']['biases'][vector_id]['URL'].split('/')[3]) fr = j['frames'][0] rows = fr['rows'] vec_ids = fr['vec_ids'] cols = fr['columns'] colnames = [col['label'] for col in cols] result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, vec_ids), rows)) return result
[docs] def model_performance(self, test_data=None, train=False, valid=False): """ Generate model metrics for this model on test_data. :param test_data: Data set for which model metrics shall be computed against. Both train and valid arguments are ignored if test_data is not None. :param train: Report the training metrics for the model. If the test_data is the training data, the training metrics are returned. :param valid: Report the validation metrics for the model. If train and valid are True, then it defaults to True. :return: An object of class H2OModelMetrics. """ if test_data is None: if not train and not valid: train = True # default to train if train: return self._model_json["output"]["training_metrics"] if valid: return self._model_json["output"]["validation_metrics"] else: # cases dealing with test_data not None if not isinstance(test_data, H2OFrame): raise ValueError("`test_data` must be of type H2OFrame. Got: " + type(test_data)) fr_key = H2OFrame.send_frame(test_data) res = H2OConnection.post_json("ModelMetrics/models/" + self._key + "/frames/" + fr_key) h2o.removeFrameShallow(fr_key) # FIXME need to do the client-side filtering... PUBDEV-874: https://0xdata.atlassian.net/browse/PUBDEV-874 raw_metrics = None for mm in res["model_metrics"]: if mm["frame"]["name"] == fr_key: raw_metrics = mm break return self._metrics_class(raw_metrics,algo=self._model_json["algo"])
[docs] def score_history(self): """ Retrieve Model Score History :return: the score history (H2OTwoDimTable) """ model = self._model_json["output"] if 'scoring_history' in model.keys() and model["scoring_history"] != None: return model["scoring_history"] else: print "No score history for this model"
[docs] def summary(self): """ Print a detailed summary of the model. :return: """ model = self._model_json["output"] if model["model_summary"]: model["model_summary"].show() # H2OTwoDimTable object
[docs] def show(self): """ Print innards of model, without regards to type :return: None """ model = self._model_json["output"] print "Model Details" print "=============" print self.__class__.__name__, ": ", self._model_json["algo_full_name"] print "Model Key: ", self._key self.summary() print # training metrics tm = model["training_metrics"] if tm: tm.show() vm = model["validation_metrics"] if vm: vm.show() if "scoring_history" in model.keys() and model["scoring_history"]: model["scoring_history"].show() if "variable_importances" in model.keys() and model["variable_importances"]: model["variable_importances"].show()
[docs] def varimp(self, return_list=False): """ Pretty print the variable importances, or return them in a list :param return_list: if True, then return the variable importances in an list (ordered from most important to least important). Each entry in the list is a 4-tuple of (variable, relative_importance, scaled_importance, percentage). :return: None or ordered list """ model = self._model_json["output"] if "variable_importances" in model.keys() and model["variable_importances"]: if not return_list: return model["variable_importances"].show() else: return model["variable_importances"].cell_values else: print "Warning: This model doesn't have variable importances"
[docs] def residual_deviance(self,train=False,valid=False): """ Retreive the residual deviance if this model has the attribute, or None otherwise. :param train: Get the residual deviance for the training set. If both train and valid are False, then train is selected by default. :param valid: Get the residual deviance for the validation set. If both train and valid are True, then train is selected by default. :return: Return the residual deviance, or None if it is not present. """ if not train and not valid: train = True if train and valid: train = True if train: return self._model_json["output"]["training_metrics"].residual_deviance() else: return self._model_json["output"]["validation_metrics"].residual_deviance()
[docs] def residual_degrees_of_freedom(self,train=False,valid=False): """ Retreive the residual degress of freedom if this model has the attribute, or None otherwise. :param train: Get the residual dof for the training set. If both train and valid are False, then train is selected by default. :param valid: Get the residual dof for the validation set. If both train and valid are True, then train is selected by default. :return: Return the residual dof, or None if it is not present. """ if not train and not valid: train = True if train and valid: train = True if train: return self._model_json["output"]["training_metrics"].residual_degrees_of_freedom() else: return self._model_json["output"]["validation_metrics"].residual_degrees_of_freedom()
[docs] def null_deviance(self,train=False,valid=False): """ Retreive the null deviance if this model has the attribute, or None otherwise. :param: train Get the null deviance for the training set. If both train and valid are False, then train is selected by default. :param: valid Get the null deviance for the validation set. If both train and valid are True, then train is selected by default. :return: Return the null deviance, or None if it is not present. """ if not train and not valid: train = True if train and valid: train = True if train: return self._model_json["output"]["training_metrics"].null_deviance() else: return self._model_json["output"]["validation_metrics"].null_deviance()
[docs] def null_degrees_of_freedom(self,train=False,valid=False): """ Retreive the null degress of freedom if this model has the attribute, or None otherwise. :param train: Get the null dof for the training set. If both train and valid are False, then train is selected by default. :param valid: Get the null dof for the validation set. If both train and valid are True, then train is selected by default. :return: Return the null dof, or None if it is not present. """ if not train and not valid: train = True if train and valid: train = True if train: return self._model_json["output"]["training_metrics"].null_degrees_of_freedom() else: return self._model_json["output"]["validation_metrics"].null_degrees_of_freedom()
[docs] def pprint_coef(self): """ Pretty print the coefficents table (includes normalized coefficients) :return: None """ print self._model_json["output"]["coefficients_table"] # will return None if no coefs!
[docs] def coef(self): """ :return: Return the coefficients for this model. """ tbl = self._model_json["output"]["coefficients_table"] if tbl is None: return None tbl = tbl.cell_values return {a[0]:a[1] for a in tbl}
[docs] def coef_norm(self): """ :return: Return the normalized coefficients """ tbl = self._model_json["output"]["coefficients_table"] if tbl is None: return None tbl = tbl.cell_values return {a[0]:a[2] for a in tbl}
[docs] def r2(self, train=False, valid=False): """ Return the R^2 for this regression model. The R^2 value is defined to be 1 - MSE/var, where var is computed as sigma*sigma. :param train: If train is True, then return the R^2 value for the training data. If train and valid are both False, then return the training R^2. :param valid: If valid is True, then return the R^2 value for the validation data. If train and valid are both True, then return the validation R^2. :return: The R^2 for this regression model. """ tm = ModelBase._get_metrics(self, *ModelBase._train_or_valid(train,valid)) if tm is None: return None return tm.r2()
[docs] def mse(self, train=False,valid=False): """ :param train: If train is True, then return the MSE value for the training data. If train and valid are both False, then return the training MSE. :param valid: If valid is True, then return the MSE value for the validation data. If train and valid are both True, then return the validation MSE. :return: The MSE for this regression model. """ tm = ModelBase._get_metrics(self, *ModelBase._train_or_valid(train,valid)) if tm is None: return None return tm.mse()
[docs] def logloss(self, train=False, valid=False): """ Get the Log Loss. If both train and valid are False, return the train. If both train and valid are True, return the valid. :param train: Return the log loss for training data. :param valid: Return the log loss for the validation data. :return: Retrieve the log loss coefficient for this set of metrics """ tm = ModelBase._get_metrics(self,*ModelBase._train_or_valid(train, valid)) if tm is None: return None return tm.logloss()
[docs] def auc(self, train=False, valid=False): """ Get the AUC. If both train and valid are False, return the train. If both train and valid are True, return the valid. :param train: Return the AUC for training data. :param valid: Return the AUC for the validation data. :return: Retrieve the AUC coefficient for this set of metrics """ tm = ModelBase._get_metrics(self,*ModelBase._train_or_valid(train, valid)) if tm is None: return None tm = tm._metric_json return tm.auc()
[docs] def aic(self, train=False, valid=False): """ Get the AIC. If both train and valid are False, return the train. If both train and valid are True, return the valid. :param train: Return the AIC for training data. :param valid: Return the AIC for the validation data. :return: Retrieve the AIC for this set of metrics """ tm = ModelBase._get_metrics(self,*ModelBase._train_or_valid(train, valid)) if tm is None: return None tm = tm._metric_json return tm["AIC"]
[docs] def giniCoef(self, train=False, valid=False): """ Get the Gini. If both train and valid are False, return the train. If both train and valid are True, return the valid. :param train: Return the Gini for training data. :param valid: Return the Gini for the validation data. :return: Retrieve the Gini coefficient for this set of metrics """ tm = ModelBase._get_metrics(self, *ModelBase._train_or_valid(train, valid)) if tm is None: return None tm = tm._metric_json return tm.giniCoef()
[docs] def download_pojo(self,path=""): """ Download the POJO for this model to the directory specified by path (no trailing slash!). If path is "", then dump to screen. :param model: Retrieve this model's scoring POJO. :param path: An absolute path to the directory where POJO should be saved. :return: None """ h2o.download_pojo(self,path) # call the "package" function
@staticmethod def _get_metrics(o, train, valid): if train: return o._model_json["output"]["training_metrics"] if valid: return o._model_json["output"]["validation_metrics"] raise ValueError("`_get_metrics` demands `train` or `valid` to be True.") @staticmethod def _train_or_valid(train,valid): """ Internal static method. :param train: a boolean for train. Ignored, however. :param valid: a boolean for valid :return: true if train, false if valid. If both are false, return True for train. """ if valid: return [False, True] return [True,False] # Delete from cluster as model goes out of scope def __del__(self): h2o.remove(self._key) @staticmethod def _has(dictionary, key): return key in dictionary and dictionary[key] is not None