# -*- encoding: utf-8 -*-
from __future__ import division, print_function, absolute_import, unicode_literals
import itertools
import h2o
from h2o.job import H2OJob
from h2o.frame import H2OFrame
from h2o.exceptions import H2OValueError
from h2o.estimators.estimator_base import H2OEstimator
from h2o.two_dim_table import H2OTwoDimTable
from h2o.display import H2ODisplay
from h2o.grid.metrics import * # NOQA
from h2o.utils.metaclass import Alias as alias, BackwardsCompatible, Deprecated as deprecated, h2o_meta
from h2o.utils.shared_utils import quoted
from h2o.utils.compatibility import * # NOQA
from h2o.utils.typechecks import assert_is_type, is_type
[docs]@BackwardsCompatible(
instance_attrs=dict(
giniCoef=lambda self, *args, **kwargs: self.gini(*args, **kwargs)
)
)
class H2OGridSearch(h2o_meta()):
"""
Grid Search of a Hyper-Parameter Space for a Model
:param model: The type of model to be explored initialized with optional parameters that will be
unchanged across explored models.
:param hyper_params: A dictionary of string parameters (keys) and a list of values to be explored by grid
search (values).
:param str grid_id: The unique id assigned to the resulting grid object. If none is given, an id will
automatically be generated.
:param search_criteria: The optional dictionary of directives which control the search of the hyperparameter space.
The dictionary can include values for: ``strategy``, ``max_models``, ``max_runtime_secs``, ``stopping_metric``,
``stopping_tolerance``, ``stopping_rounds`` and ``seed``. The default strategy, "Cartesian", covers the entire space of
hyperparameter combinations. If you want to use cartesian grid search, you can leave the search_criteria
argument unspecified. Specify the "RandomDiscrete" strategy to get random search of all the combinations of
your hyperparameters with three ways of specifying when to stop the search: max number of models, max time, and
metric-based early stopping (e.g., stop if MSE hasn’t improved by 0.0001 over the 5 best models).
Examples below::
>>> criteria = {"strategy": "RandomDiscrete", "max_runtime_secs": 600,
... "max_models": 100, "stopping_metric": "AUTO",
... "stopping_tolerance": 0.00001, "stopping_rounds": 5,
... "seed": 123456}
>>> criteria = {"strategy": "RandomDiscrete", "max_models": 42,
... "max_runtime_secs": 28800, "seed": 1234}
>>> criteria = {"strategy": "RandomDiscrete", "stopping_metric": "AUTO",
... "stopping_tolerance": 0.001, "stopping_rounds": 10}
>>> criteria = {"strategy": "RandomDiscrete", "stopping_rounds": 5,
... "stopping_metric": "misclassification",
... "stopping_tolerance": 0.00001}
:param parallelism: Level of parallelism during grid model building. 1 = sequential building (default).
Use the value of 0 for adaptive parallelism - decided by H2O. Any number > 1 sets the exact number of models
built in parallel.
:returns: a new H2OGridSearch instance
Examples
--------
>>> from h2o.grid.grid_search import H2OGridSearch
>>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
>>> hyper_parameters = {'alpha': [0.01,0.5], 'lambda': [1e-5,1e-6]}
>>> gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
>>> training_data = h2o.import_file("smalldata/logreg/benign.csv")
>>> gs.train(x=list(range(3)) + list(range(4,11)),y=3, training_frame=training_data)
>>> gs.show()
"""
def __init__(self, model, hyper_params, grid_id=None, search_criteria=None, export_checkpoints_dir=None,
             parallelism=1):
    """
    Validate the arguments and initialise an untrained grid search.

    :param model: an H2OEstimator instance, an H2OEstimator subclass (instantiated here without
        arguments), or None.
    :param dict hyper_params: hyper-parameter names mapped to the values to explore; a shallow copy is stored.
    :param str grid_id: optional id of the grid.
    :param dict search_criteria: optional search directives; a shallow copy is stored.
    :param export_checkpoints_dir: stored as given and forwarded to the grid build request by ``train``.
    :param parallelism: degree of parallelism during model building; stored as given.
    """
    assert_is_type(model, None, H2OEstimator, lambda mdl: issubclass(mdl, H2OEstimator))
    assert_is_type(hyper_params, dict)
    assert_is_type(grid_id, None, str)
    assert_is_type(search_criteria, None, dict)
    # An estimator class (rather than an instance) is instantiated with its defaults.
    if model is not None and not is_type(model, H2OEstimator):
        model = model()

    # User-facing configuration.
    self._id = grid_id
    self.model = model
    self.hyper_params = dict(hyper_params)
    if search_criteria is None:
        self.search_criteria = None
    else:
        self.search_criteria = dict(search_criteria)
    self.export_checkpoints_dir = export_checkpoints_dir
    self._parallelism = parallelism  # degree of parallelism during model building

    # State filled in once a grid has been built.
    self._grid_json = None
    self.models = None  # list of H2O Estimator instances
    self._parms = {}  # internal, for object recycle
    self.parms = {}  # external

    # Asynchronous build bookkeeping.
    self._future = False  # used by __repr__/show to query job state
    self._job = None  # used when _future is True
@property
def grid_id(self):
    """
    A key that identifies this grid search object in H2O.

    :returns: the stored grid id (``self._id``).
    """
    return self._id


@grid_id.setter
def grid_id(self, value):
    """
    Store a new grid id and rename the existing key on the backend through a rapids call.

    :param value: the new id of the grid.
    """
    oldname = self.grid_id
    self._id = value
    # When no id was assigned before there is nothing to rename; formatting None into the
    # expression would ask the backend to rename the literal key "None".
    if oldname is not None:
        h2o.rapids('(rename "{}" "{}")'.format(oldname, value))
@property
def model_ids(self):
    """
    Names of the models listed in the grid json.

    :returns: a list holding the ``name`` entry of every item under the ``model_ids`` key of the grid json.
    """
    names = []
    for entry in self._grid_json["model_ids"]:
        names.append(entry['name'])
    return names
@property
def hyper_names(self):
    """
    Hyper-parameter names as recorded in the grid json.

    :returns: the value stored under the ``hyper_names`` key of the grid json.
    """
    grid_json = self._grid_json
    return grid_json["hyper_names"]
@property
def failed_params(self):
    """
    Parameters of the failed model builds as recorded in the grid json.

    :returns: the value stored under the ``failed_params`` key of the grid json, or None when absent.
    """
    grid_json = self._grid_json
    return grid_json.get("failed_params")
@property
def failure_details(self):
    """
    Failure messages as recorded in the grid json.

    :returns: the value stored under the ``failure_details`` key of the grid json, or None when absent.
    """
    grid_json = self._grid_json
    return grid_json.get("failure_details")
@property
def failure_stack_traces(self):
    """
    Failure stack traces as recorded in the grid json.

    :returns: the value stored under the ``failure_stack_traces`` key of the grid json, or None when absent.
    """
    grid_json = self._grid_json
    return grid_json.get("failure_stack_traces")
@property
def failed_raw_params(self):
    """
    Raw parameters of the failed model builds as recorded in the grid json.

    :returns: the value stored under the ``failed_raw_params`` key of the grid json, or None when absent.
    """
    grid_json = self._grid_json
    return grid_json.get("failed_raw_params")
def start(self, x, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
          validation_frame=None, **params):
    """
    Start an asynchronous grid build.

    Marks the grid as "future" and then delegates to :meth:`train` with the very same arguments.
    To block for results, call :meth:`join`.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param params: any further keyword arguments, forwarded unchanged to :meth:`train`.
    """
    # The flag is read by the model-building step to decide whether to wait for the job.
    self._future = True
    train_args = dict(
        x=x,
        y=y,
        training_frame=training_frame,
        offset_column=offset_column,
        fold_column=fold_column,
        weights_column=weights_column,
        validation_frame=validation_frame,
    )
    # ``params`` cannot contain any of the names above, they are bound to the named parameters.
    train_args.update(params)
    self.train(**train_args)
def join(self):
    """
    Wait until grid finishes computing.

    Clears the "future" flag, polls the stored job and finally drops the job reference.
    """
    pending_job = self._job
    self._future = False
    pending_job.poll()
    self._job = None
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
          validation_frame=None, **params):
    """
    Train the model synchronously (i.e. do not return until the model finishes training).

    To train asynchronously call :meth:`start`.

    The build parameters are assembled from, in increasing priority: the recycled ``self._parms``, the
    frame-related arguments of this call, the grid settings (search criteria, checkpoint directory,
    parallelism, hyper-parameters), the non-None parameters of ``self.model`` and finally ``params``.
    The resolved predictor list is then stored under ``"x"`` and the result is passed to
    :meth:`build_model`.

    :param x: A list of column names or indices indicating the predictor columns. When None, every column
        of the training frame except the response is used.
    :param y: An index or a column name indicating the response column. A list/tuple holding exactly one
        column reference is accepted as well.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param params: additional build parameters; they take precedence over everything else.
    :raises ValueError: if training_frame is missing or y references more than one column.
    :raises H2OValueError: if an entry of x does not exist in the training frame.
    """
    parms = self._parms.copy()
    parms.update(x=x, y=y, training_frame=training_frame, offset_column=offset_column,
                 fold_column=fold_column, weights_column=weights_column, validation_frame=validation_frame)
    # dictionaries have special handling in grid search, avoid the implicit conversion
    parms["search_criteria"] = None if self.search_criteria is None else str(self.search_criteria)
    parms["export_checkpoints_dir"] = self.export_checkpoints_dir
    parms["parallelism"] = self._parallelism
    parms["hyper_parameters"] = None if self.hyper_params is None else str(self.hyper_params)  # unique to grid search
    parms.update({k: v for k, v in self.model._parms.items() if v is not None})  # unique to grid search
    parms.update(params)

    if training_frame is None:
        raise ValueError("Missing training_frame")

    if y is not None and is_type(y, list, tuple):
        if len(y) != 1:
            raise ValueError('y must be a single column reference')
        # Unwrap the local value too: it is used below to exclude the response from the
        # default predictors, and a list cannot be a set member.
        y = y[0]
        parms["y"] = y

    if x is None:
        if isinstance(y, int):
            ncols = training_frame.ncols
            # Map a negative index onto its positive position so the response is really excluded.
            y_index = y + ncols if -ncols <= y < 0 else y
            xset = set(range(ncols)) - {y_index}
        else:
            xset = set(training_frame.names) - {y}
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-training_frame.ncols <= xi < training_frame.ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(training_frame.names[xi])
            else:
                if xi not in training_frame.names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
    parms["x"] = list(xset)
    self.build_model(parms)
def build_model(self, algo_params):
    """
    (internal) Split the frame arguments off ``algo_params``, validate the response and start the build.

    ``x``, ``y``, ``training_frame`` and ``validation_frame`` are popped from ``algo_params`` (the dict is
    modified in place); what remains is handed to ``_model_build`` as the algorithm parameters. For
    supervised algorithms the response is resolved to a column name and the estimator type of
    ``self.model`` is set to ``"classifier"`` for an ``enum`` response and ``"regressor"`` otherwise.

    :param dict algo_params: build parameters; must contain ``training_frame`` and ``x``.
    :raises ValueError: if the training frame is missing, if ``y`` is given for an autoencoder, or if
        ``y`` is missing for a supervised algorithm.
    """
    if algo_params["training_frame"] is None:
        raise ValueError("Missing training_frame")

    predictors = algo_params.pop("x")
    response = algo_params.pop("y", None)
    train_frame = algo_params.pop("training_frame")
    valid_frame = algo_params.pop("validation_frame", None)

    autoencoder = "autoencoder" in algo_params and algo_params["autoencoder"]
    algo = self.model._compute_algo()  # unique to grid search
    unsupervised = autoencoder or algo in ("pca", "svd", "kmeans", "glrm")

    if autoencoder and response is not None:
        raise ValueError("y should not be specified for autoencoder.")

    if not unsupervised:
        if response is None:
            raise ValueError("Missing response")
        # Anything that is not a column name is treated as a column index.
        if response not in train_frame.names:
            response = train_frame.names[response]
        if train_frame.types[response] == "enum":
            self.model._estimator_type = "classifier"
        else:
            self.model._estimator_type = "regressor"

    self._model_build(predictors, response, train_frame, valid_frame, algo_params)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Post the grid build request to the backend and, unless running asynchronously, collect the results.

    When ``self._future`` is set the job is stored in ``self._job`` and the method returns right after
    submitting it. Otherwise it waits for the job, prints any failures reported in the grid json, loads
    the built models into ``self.models`` and resolves the grid against the first model.

    :param x: predictor column name(s) or index(es); a single value, a list or a tuple.
    :param y: response column name or index, or None.
    :param tframe: the training frame.
    :param vframe: the validation frame, or None.
    :param dict kwargs: remaining build parameters; ``training_frame``, ``validation_frame``,
        ``response_column`` and ``ignored_columns`` entries are written into it.
    :raises ValueError: if the finished grid contains no model.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if is_type(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y

    # Normalise the predictors to a list of names. A tuple is converted so that it can be concatenated
    # below, and every entry is checked individually so that an empty collection is fine too.
    x = list(x) if is_type(x, list, tuple) else [x]
    x = [tframe.names[xi] if is_type(xi, int) else xi for xi in x]

    # The special columns are optional: direct callers of build_model may not supply them.
    offset = kwargs.get("offset_column")
    folds = kwargs.get("fold_column")
    weights = kwargs.get("weights_column")
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]

    # Drop unset parameters and refer to frames by their id.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
    rest_ver = kwargs.pop("_rest_version", None)

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))
    if self._future:
        self._job = grid
        return

    grid.poll()
    grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))

    # Messages collected here are raised further down if no grid model is returned.
    failure_messages_stacks = ""
    failure_details = grid_json["failure_details"]
    if len(failure_details) > 0:
        print("Errors/Warnings building gridsearch model\n")
        failed_params = grid_json["failed_params"]
        stack_traces = grid_json["failure_stack_traces"]
        for error_index, error_message in enumerate(failure_details):
            if isinstance(failed_params[error_index], dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[error_index][h_name]))
            if len(stack_traces) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, stack_traces[error_index]))
                # NOTE(review): the message is only collected when a stack trace exists, as in the
                # upstream layout; the indentation was not recoverable from this file -- confirm.
                failure_messages_stacks += error_message + '\n'

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]
    for model in self.models:
        model._estimator_type = self.model._estimator_type

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = h2o.api("GET /%d/Models/%s" %
                                   (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    elif len(failure_messages_stacks) > 0:
        raise ValueError(failure_messages_stacks)
    else:
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
def _resolve_grid(self, grid_id, grid_json, first_model_json):
    """
    Attach the metrics class matching the first model's category to this grid.

    A scratch instance of that class receives the grid id, the grid json and the recycled parameters;
    the class is mixed into ``self`` and the scratch instance's attributes are copied over.

    :param grid_id: id of the built grid.
    :param grid_json: grid description returned by the backend; must contain ``export_checkpoints_dir``.
    :param first_model_json: json of the first model of the grid, used to pick the metrics class.
    """
    metrics_cls = H2OGridSearch._metrics_class(first_model_json)
    holder = metrics_cls()
    holder._id = grid_id
    holder._grid_json = grid_json
    holder._parms = self._parms
    self.export_checkpoints_dir = holder._grid_json["export_checkpoints_dir"]
    H2OEstimator.mixin(self, metrics_cls)
    copied_state = holder.__dict__.copy()
    self.__dict__.update(copied_state)
def __getitem__(self, item):
    """Return ``self.models[item]``; ``item`` is passed to the model list unchanged."""
    built_models = self.models
    return built_models[item]
def __iter__(self):
    """Return a generator yielding ``self[0]`` .. ``self[n - 1]``, ``n`` being the current number of models."""
    count = len(self.models)
    positions = range(count)
    return (self[pos] for pos in positions)
def __len__(self):
    """Return the number of models currently held by the grid."""
    built_models = self.models
    return len(built_models)
def __repr__(self):
    """Print the grid through :meth:`show` and return an empty string."""
    empty = ""
    self.show()
    return empty
def predict(self, test_data):
    """
    Predict on a dataset with every model of the grid.

    :param H2OFrame test_data: Data to be predicted on.
    :returns: a dictionary mapping each model id to the result of that model's ``predict(test_data)``.
    """
    predictions = {}
    for mdl in self.models:
        predictions[mdl.model_id] = mdl.predict(test_data)
    return predictions
def is_cross_validated(self):
    """Return a dictionary mapping each model id to that model's ``is_cross_validated()`` result."""
    flags = {}
    for mdl in self.models:
        flags[mdl.model_id] = mdl.is_cross_validated()
    return flags
def xval_keys(self):
    """Return a dictionary mapping each model id to that model's ``xval_keys()`` result."""
    keys_by_model = {}
    for mdl in self.models:
        keys_by_model[mdl.model_id] = mdl.xval_keys()
    return keys_by_model
def get_xval_models(self, key=None):
    """
    Collect the cross-validation models of every model in the grid.

    :param str key: forwarded to each model's ``get_xval_models``; defaults to None.
    :returns: a dictionary mapping each model id to the result of that model's ``get_xval_models(key)``.
    """
    collected = {}
    for mdl in self.models:
        collected[mdl.model_id] = mdl.get_xval_models(key)
    return collected
def xvals(self):
    """Return a dictionary mapping each model id to the value of that model's ``xvals`` attribute."""
    collected = {}
    for mdl in self.models:
        # ``xvals`` is read as an attribute of the model, it is not called.
        collected[mdl.model_id] = mdl.xvals
    return collected
def deepfeatures(self, test_data, layer):
    """
    Obtain a hidden layer's details on a dataset for every model of the grid.

    :param test_data: Data to create a feature space on.
    :param int layer: Index of the hidden layer.
    :returns: A dictionary mapping each model id to that model's ``deepfeatures(test_data, layer)`` result.
    """
    features = {}
    for mdl in self.models:
        features[mdl.model_id] = mdl.deepfeatures(test_data, layer)
    return features
def weights(self, matrix_id=0):
    """
    Return the frame for the respective weight matrix of every model.

    :param matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return.
    :returns: a dictionary mapping each model id to that model's ``weights(matrix_id)`` result.
    """
    frames = {}
    for mdl in self.models:
        frames[mdl.model_id] = mdl.weights(matrix_id)
    return frames
def biases(self, vector_id=0):
    """
    Return the frame for the respective bias vector of every model.

    :param vector_id: an integer, ranging from 0 to number of layers, that specifies the bias vector to return.
    :returns: a dictionary mapping each model id to that model's ``biases(vector_id)`` result.
    """
    frames = {}
    for mdl in self.models:
        frames[mdl.model_id] = mdl.biases(vector_id)
    return frames
def normmul(self):
    """
    Normalization/Standardization multipliers for numeric predictors.

    :returns: a dictionary mapping each model id to that model's ``normmul()`` result.
    """
    return dict((mdl.model_id, mdl.normmul()) for mdl in self.models)
def normsub(self):
    """
    Normalization/Standardization offsets for numeric predictors.

    :returns: a dictionary mapping each model id to that model's ``normsub()`` result.
    """
    return dict((mdl.model_id, mdl.normsub()) for mdl in self.models)
def respmul(self):
    """
    Normalization/Standardization multipliers for numeric response.

    :returns: a dictionary mapping each model id to that model's ``respmul()`` result.
    """
    return dict((mdl.model_id, mdl.respmul()) for mdl in self.models)
def respsub(self):
    """
    Normalization/Standardization offsets for numeric response.

    :returns: a dictionary mapping each model id to that model's ``respsub()`` result.
    """
    return dict((mdl.model_id, mdl.respsub()) for mdl in self.models)
def catoffsets(self):
    """
    Categorical offsets for one-hot encoding.

    :returns: a dictionary mapping each model id to that model's ``catoffsets()`` result.
    """
    return dict((mdl.model_id, mdl.catoffsets()) for mdl in self.models)
def scoring_history(self):
    """
    Retrieve the scoring history of every model.

    :returns: a dictionary mapping each model id to that model's ``scoring_history()`` result.
    """
    histories = {}
    for mdl in self.models:
        histories[mdl.model_id] = mdl.scoring_history()
    return histories
def summary(self, header=True):
    """
    Print a detailed summary of the explored models.

    One row is printed per model: the first row of the model's ``model_summary`` table with its first
    cell replaced by the model id. The column header is taken from the last model's summary table.
    When the grid holds no models only a short notice is printed.

    :param bool header: if True, print the ``Grid Summary:`` title before the table.
    """
    if not self.models:
        # Without at least one model there is no summary table to take the column header from.
        print()
        if header:
            print('Grid Summary:')
        print()
        print("No models available for this grid search")
        return

    table = []
    model_summary = None
    for model in self.models:
        model_summary = model._model_json["output"]["model_summary"]
        r_values = list(model_summary.cell_values[0])
        r_values[0] = model.model_id
        table.append(r_values)

    print()
    if header:
        print('Grid Summary:')
    print()
    H2ODisplay(table, header=['Model Id'] + model_summary.col_header[1:], numalign="left", stralign="left")
def show(self):
    """
    Print models sorted by metric.

    When the grid holds models, the result of :meth:`sorted_metric_table` is printed. Otherwise a table
    enumerating every combination of the hyper-parameter values (numbered from 1) is printed.
    """
    combos = itertools.product(*list(self.hyper_params.values()))
    if self.models:
        print(self.sorted_metric_table())
        return

    rows = []
    for position, combo in enumerate(combos, 1):
        rows.append([position, list(combo)])
    hyper_column = 'Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']'
    title = 'Grid Search of Model ' + self.model.__class__.__name__
    print(H2OTwoDimTable(col_header=['Model', hyper_column], table_header=title, cell_values=rows))
def varimp(self, use_pandas=False):
    """
    Collect the variable importances of every model.

    :param bool use_pandas: forwarded to each model's ``varimp``.
    :returns: A dictionary mapping each model id to that model's ``varimp(use_pandas)`` result.
    """
    importances = {}
    for mdl in self.models:
        importances[mdl.model_id] = mdl.varimp(use_pandas)
    return importances
def residual_deviance(self, train=False, valid=False, xval=False):
    """
    Retrieve the residual deviance of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``residual_deviance``.

    :param bool train: Get the residual deviance for the training set.
    :param bool valid: Get the residual deviance for the validation set.
    :param bool xval: Get the residual deviance for the cross-validated models.
    :returns: a dictionary mapping each model id to that model's ``residual_deviance(train, valid, xval)``.
    """
    deviances = {}
    for mdl in self.models:
        deviances[mdl.model_id] = mdl.residual_deviance(train, valid, xval)
    return deviances
def residual_degrees_of_freedom(self, train=False, valid=False, xval=False):
    """
    Retrieve the residual degrees of freedom of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``residual_degrees_of_freedom``.

    :param bool train: Get the residual dof for the training set.
    :param bool valid: Get the residual dof for the validation set.
    :param bool xval: Get the residual dof for the cross-validated models.
    :returns: a dictionary mapping each model id to that model's
        ``residual_degrees_of_freedom(train, valid, xval)``.
    """
    dofs = {}
    for mdl in self.models:
        dofs[mdl.model_id] = mdl.residual_degrees_of_freedom(train, valid, xval)
    return dofs
def null_deviance(self, train=False, valid=False, xval=False):
    """
    Retrieve the null deviance of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``null_deviance``.

    :param bool train: Get the null deviance for the training set.
    :param bool valid: Get the null deviance for the validation set.
    :param bool xval: Get the null deviance for the cross-validated models.
    :returns: a dictionary mapping each model id to that model's ``null_deviance(train, valid, xval)``.
    """
    deviances = {}
    for mdl in self.models:
        deviances[mdl.model_id] = mdl.null_deviance(train, valid, xval)
    return deviances
def null_degrees_of_freedom(self, train=False, valid=False, xval=False):
    """
    Retrieve the null degrees of freedom of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``null_degrees_of_freedom``.

    :param bool train: Get the null dof for the training set.
    :param bool valid: Get the null dof for the validation set.
    :param bool xval: Get the null dof for the cross-validated models.
    :returns: a dictionary mapping each model id to that model's ``null_degrees_of_freedom(train, valid, xval)``.
    """
    dofs = {}
    for mdl in self.models:
        dofs[mdl.model_id] = mdl.null_degrees_of_freedom(train, valid, xval)
    return dofs
def pprint_coef(self):
    """
    Pretty print the coefficients table of every model.

    For each model a ``Model <position>`` line (positions start at 0) is printed, followed by the output
    of that model's ``pprint_coef()`` and an empty line.
    """
    position = 0
    for mdl in self.models:
        print('Model', position)
        mdl.pprint_coef()
        print()
        position += 1
def coef(self):
    """
    Return the coefficients that can be applied to the non-standardized data.

    Note: standardize = True by default. If set to False, then coef() returns the coefficients that are
    fit directly.

    :returns: a dictionary mapping each model id to that model's ``coef()`` result.
    """
    coefficients = {}
    for mdl in self.models:
        coefficients[mdl.model_id] = mdl.coef()
    return coefficients
def coef_norm(self):
    """
    Return coefficients fitted on the standardized data.

    Requires standardize = True, which is on by default. These coefficients can be used to evaluate
    variable importance.

    :returns: a dictionary mapping each model id to that model's ``coef_norm()`` result.
    """
    coefficients = {}
    for mdl in self.models:
        coefficients[mdl.model_id] = mdl.coef_norm()
    return coefficients
def r2(self, train=False, valid=False, xval=False):
    """
    Return the R^2 of every model.

    The R^2 value is defined to be ``1 - MSE/var``, where ``var`` is computed as ``sigma^2``.
    The three flags are forwarded positionally, unchanged, to each model's ``r2``.

    :param bool train: request the R^2 value for the training data.
    :param bool valid: request the R^2 value for the validation data.
    :param bool xval: request the R^2 value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``r2(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.r2(train, valid, xval)
    return scores
def mse(self, train=False, valid=False, xval=False):
    """
    Get the MSE of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``mse``.

    :param bool train: request the MSE value for the training data.
    :param bool valid: request the MSE value for the validation data.
    :param bool xval: request the MSE value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``mse(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.mse(train, valid, xval)
    return scores
def rmse(self, train=False, valid=False, xval=False):
    """
    Get the RMSE of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``rmse``.

    :param bool train: request the value for the training data.
    :param bool valid: request the value for the validation data.
    :param bool xval: request the value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``rmse(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.rmse(train, valid, xval)
    return scores
def mae(self, train=False, valid=False, xval=False):
    """
    Get the MAE of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``mae``.

    :param bool train: request the value for the training data.
    :param bool valid: request the value for the validation data.
    :param bool xval: request the value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``mae(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.mae(train, valid, xval)
    return scores
def rmsle(self, train=False, valid=False, xval=False):
    """
    Get the RMSLE of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``rmsle``.

    :param bool train: request the value for the training data.
    :param bool valid: request the value for the validation data.
    :param bool xval: request the value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``rmsle(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.rmsle(train, valid, xval)
    return scores
def logloss(self, train=False, valid=False, xval=False):
    """
    Get the Log Loss of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``logloss``.

    :param bool train: request the Log Loss value for the training data.
    :param bool valid: request the Log Loss value for the validation data.
    :param bool xval: request the Log Loss value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``logloss(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.logloss(train, valid, xval)
    return scores
def mean_residual_deviance(self, train=False, valid=False, xval=False):
    """
    Get the Mean Residual Deviance of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``mean_residual_deviance``.

    :param bool train: request the Mean Residual Deviance value for the training data.
    :param bool valid: request the Mean Residual Deviance value for the validation data.
    :param bool xval: request the Mean Residual Deviance value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's
        ``mean_residual_deviance(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.mean_residual_deviance(train, valid, xval)
    return scores
def auc(self, train=False, valid=False, xval=False):
    """
    Get the AUC of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``auc``.

    :param bool train: request the AUC value for the training data.
    :param bool valid: request the AUC value for the validation data.
    :param bool xval: request the AUC value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``auc(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.auc(train, valid, xval)
    return scores
def aic(self, train=False, valid=False, xval=False):
    """
    Get the AIC of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``aic``.

    :param bool train: request the AIC value for the training data.
    :param bool valid: request the AIC value for the validation data.
    :param bool xval: request the AIC value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``aic(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.aic(train, valid, xval)
    return scores
def gini(self, train=False, valid=False, xval=False):
    """
    Get the Gini Coefficient of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``gini``.

    :param bool train: request the Gini Coefficient value for the training data.
    :param bool valid: request the Gini Coefficient value for the validation data.
    :param bool xval: request the Gini Coefficient value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``gini(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.gini(train, valid, xval)
    return scores
# @alias('pr_auc')
def aucpr(self, train=False, valid=False, xval=False):
    """
    Get the aucPR (Area Under PRECISION RECALL Curve) of every model.

    The three flags are forwarded positionally, unchanged, to each model's ``aucpr``.

    :param bool train: request the aucpr value for the training data.
    :param bool valid: request the aucpr value for the validation data.
    :param bool xval: request the aucpr value for the cross validation data.
    :returns: a dictionary mapping each model id to that model's ``aucpr(train, valid, xval)`` result.
    """
    scores = {}
    for mdl in self.models:
        scores[mdl.model_id] = mdl.aucpr(train, valid, xval)
    return scores
# Deprecated spelling kept for backwards compatibility; ``aucpr`` is named as its replacement.
# NOTE(review): the body is empty, so any real behaviour has to come from the ``deprecated``
# decorator (presumably it forwards the call to ``aucpr``) -- confirm against h2o.utils.metaclass.
[docs] @deprecated(replaced_by=aucpr)
def pr_auc(self):
pass
def get_hyperparams(self, id, display=True):
    """
    Get the hyperparameters of a model explored by grid search.

    :param id: The model id of the model with hyperparameters of interest, or its integer position in the grid.
    :param bool display: Flag to indicate whether to display the hyperparameter names.
    :returns: A list with the actual value of every hyper-parameter in ``self.hyper_params``; when an
        actual value is a list, its first element is used.
    """
    position = id if is_type(id, int) else self.model_ids.index(id)
    model = self[position]

    # if cross-validation is turned on, parameters in one of the fold models actually contain the
    # max_runtime_secs parameter and not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    values = []
    for name in self.hyper_params:
        if isinstance(model.params[name]['actual'], list):
            values.append(model.params[name]['actual'][0])
        else:
            values.append(model.params[name]['actual'])

    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return values
def get_hyperparams_dict(self, id, display=True):
    """
    Derive and return the model parameters used to train the particular grid search model.

    :param id: The model id of the model with hyperparameters of interest, or its integer position in the grid.
    :param bool display: Flag to indicate whether to display the hyperparameter names.
    :returns: A dict mapping every name in ``self.hyper_names`` to its actual value for this model; when
        an actual value is a list, its first element is used.
    """
    position = id if is_type(id, int) else self.model_ids.index(id)
    model = self[position]
    chosen = dict()

    # if cross-validation is turned on, parameters in one of the fold models actually contain the
    # max_runtime_secs parameter and not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    for name in self.hyper_names:
        if isinstance(model.params[name]['actual'], list):
            chosen[name] = model.params[name]['actual'][0]
        else:
            chosen[name] = model.params[name]['actual']

    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return chosen
def sorted_metric_table(self):
    """
    Retrieve summary table of an H2O Grid Search.

    :returns: the result of ``as_data_frame()`` on the ``summary_table`` entry of the grid json. When that
        entry is None a notice is printed and None is returned.
    """
    table = self._grid_json["summary_table"]
    if table is None:
        print("No sorted metric table for this grid search")
        return None
    return table.as_data_frame()
@staticmethod
def _metrics_class(model_json):
    """
    Pick the grid-search metrics class that matches a model's category.

    :param model_json: model description; the category is read from ``["output"]["model_category"]``.
    :returns: the grid-search class associated with that category.
    :raises NotImplementedError: for a category that has no associated class.
    """
    category = model_json["output"]["model_category"]
    if category == "Binomial":
        return H2OBinomialGridSearch
    if category == "Clustering":
        return H2OClusteringGridSearch
    if category == "Regression":
        return H2ORegressionGridSearch
    if category == "Multinomial":
        return H2OMultinomialGridSearch
    if category == "Ordinal":
        return H2OOrdinalGridSearch
    if category == "AutoEncoder":
        return H2OAutoEncoderGridSearch
    if category == "DimReduction":
        return H2ODimReductionGridSearch
    raise NotImplementedError(category)
def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance.

    Optionally specify a metric by which to sort models and a sort order.
    Note that if neither cross-validation nor a validation frame is used in the grid search, then the
    training metrics will display in the "get grid" output. If a validation frame is passed to the grid, and
    ``nfolds = 0``, then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation
    metrics will display even if a validation frame is provided.

    :param str sort_by: A metric by which to sort the models in the grid space. Choices are: ``"logloss"``,
        ``"residual_deviance"``, ``"mse"``, ``"auc"``, ``"r2"``, ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"``, etc.
    :param bool decreasing: Sort the models in decreasing order of metric if true, otherwise sort in increasing
        order (default).
    :returns: this very instance when neither argument is given; otherwise a new H2OGridSearch instance
        holding the models in the order returned by the backend.
    :raises ValueError: if the backend reports no models for this grid.
    """
    if sort_by is None and decreasing is None: return self

    grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
    if not grid_json['model_ids']:
        # The metrics class is derived from the first model, so an empty grid cannot be resolved.
        raise ValueError("Grid %s contains no models" % self._id)

    # Carry the complete configuration over so that the sorted copy behaves like the original grid.
    grid = H2OGridSearch(self.model, self.hyper_params, self._id,
                         search_criteria=self.search_criteria,
                         export_checkpoints_dir=self.export_checkpoints_dir,
                         parallelism=self._parallelism)
    grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
    first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    m = model_class()
    m._id = self._id
    m._grid_json = grid_json
    m._parms = grid._parms
    H2OEstimator.mixin(grid, model_class)
    grid.__dict__.update(m.__dict__.copy())
    return grid
@deprecated("grid.sort_by() is deprecated; use grid.get_grid() instead")
def sort_by(self, metric, increasing=True):
    """
    Deprecated since 2016-12-12, use grid.get_grid() instead.

    :param str metric: name of a metric method of this grid, with or without a trailing call such as ``()``.
    :param bool increasing: when False the rows are reversed after sorting in increasing order.
    :returns: an H2OTwoDimTable with the model ids, their hyper-parameters and the metric values.
    """
    if metric[-1] != ')':
        metric += '()'
    # NOTE(review): ``metric`` is evaluated as Python source; never pass untrusted input here.
    scored = eval('self.' + metric + '.items()')
    ranked = sorted(scored, key=lambda id_and_score: id_and_score[1])
    columns = [list(column) for column in zip(*ranked)]
    hyper_column = [self.get_hyperparams(model_id, display=False) for model_id in columns[0]]
    columns.insert(1, hyper_column)
    if not increasing:
        for column in columns:
            column.reverse()
    # Strip an empty call suffix so that the column header shows the bare metric name.
    if metric[-2] == '(':
        metric = metric[:-2]
    hyper_header = 'Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']'
    return H2OTwoDimTable(
        col_header=['Model Id', hyper_header, metric],
        table_header='Grid Search Results for ' + self.model.__class__.__name__,
        cell_values=[list(row) for row in zip(*columns)])