# -*- encoding: utf-8 -*-
"""
Regression model.
:copyright: (c) 2016 H2O.ai
:license: Apache License Version 2.0 (see LICENSE for details)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.model.confusion_matrix import ConfusionMatrix
from h2o.plot import decorate_plot_result, get_matplotlib_pyplot, RAISE_ON_FIGURE_ACCESS
from h2o.utils.compatibility import * # NOQA
from h2o.utils.metaclass import backwards_compatibility, deprecated_fn, h2o_meta, deprecated_params
from h2o.utils.typechecks import assert_is_type, assert_satisfies, is_type, numeric
@backwards_compatibility(
    instance_attrs=dict(
        giniCoef=lambda self, *args, **kwargs: self.gini(*args, **kwargs)
    )
)
class MetricsBase(h2o_meta()):
    """
    A parent class to house common metrics available for the various Metrics types.

    The methods here are available across different model categories. Every accessor reads
    from the metrics JSON (a dict-like object) stored on the instance as ``_metric_json``.
    """

    def __init__(self, metric_json, on=None, algo=""):
        """
        Wrap a blob of metrics JSON.

        :param metric_json: dict-like metrics payload, or another ``MetricsBase`` instance whose payload is reused.
        :param on: which data the metrics were computed on; one of ``"training_metrics"``,
            ``"validation_metrics"``, ``"cross_validation_metrics"`` or ``None``.
        :param algo: name of the algorithm; printed in the header of :meth:`show`.
        :raises ValueError: if ``on`` is not one of the accepted values.
        """
        # Yep, it's messed up... (a metrics object may be handed in instead of raw json)
        if isinstance(metric_json, MetricsBase):
            metric_json = metric_json._metric_json
        self._metric_json = metric_json
        self._algo = algo
        # train and valid and xval are not mutually exclusive -- could have a test. train and
        # valid only make sense at model build time.
        self._on_train = on == "training_metrics"
        self._on_valid = on == "validation_metrics"
        self._on_xval = on == "cross_validation_metrics"
        if on is not None and not (self._on_train or self._on_valid or self._on_xval):
            raise ValueError("on expected to be training_metrics, validation_metrics, "
                             "cross_validation_metrics or None. Got: " + str(on))

    @classmethod
    def make(cls, kvs):
        """Factory method to instantiate a MetricsBase object from the list of key-value pairs."""
        return cls(metric_json=dict(kvs))

    def __repr__(self):
        """Print the metrics summary via :meth:`show` and return an empty string."""
        # FIXME !!! __repr__ should never print anything, but return a string.
        # Kept as is on purpose: existing callers rely on the summary being printed.
        self.show()
        return ""

    # TODO: convert to actual fields list
    def __getitem__(self, key):
        """Return the raw metrics JSON entry for ``key``, or None when the key is absent."""
        return self._metric_json.get(key)

    @staticmethod
    def _has(dictionary, key):
        """Return True when ``key`` is present in ``dictionary`` with a value other than None."""
        return key in dictionary and dictionary[key] is not None

    def show(self):
        """Display a short summary of the metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.show()
        """
        if self._metric_json is None:
            print("WARNING: Model metrics cannot be calculated and metric_json is empty due to the absence of the response column in your dataset.")
            return
        metric_type = self._metric_json['__meta']['schema_type']

        # Schema-type groups deciding which sections of the summary are printed.
        types_w_glm = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric', 'ModelMetricsBinomialGLM',
                       'ModelMetricsBinomialGLMGeneric', 'ModelMetricsHGLMGaussianGaussian',
                       'ModelMetricsHGLMGaussianGaussianGeneric']
        types_w_clustering = ['ModelMetricsClustering']
        types_w_mult = ['ModelMetricsMultinomial', 'ModelMetricsMultinomialGeneric']
        types_w_ord = ['ModelMetricsOrdinal', 'ModelMetricsOrdinalGeneric']
        types_w_bin = ['ModelMetricsBinomial', 'ModelMetricsBinomialGeneric', 'ModelMetricsBinomialGLM',
                       'ModelMetricsBinomialGLMGeneric']
        types_w_r2 = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric']
        types_w_mean_residual_deviance = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric',
                                          'ModelMetricsRegression', 'ModelMetricsRegressionGeneric']
        types_w_mean_absolute_error = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric',
                                       'ModelMetricsRegression', 'ModelMetricsRegressionGeneric']
        types_w_mean_per_class_error = ['ModelMetricsBinomial', 'ModelMetricsBinomialGeneric',
                                        'ModelMetricsOrdinal', 'ModelMetricsOrdinalGeneric'] + types_w_mult
        types_w_logloss = types_w_bin + types_w_mult + types_w_ord
        types_w_dim = ["ModelMetricsGLRM"]
        types_w_anomaly = ['ModelMetricsAnomaly']
        types_w_cox = ['ModelMetricsRegressionCoxPH']
        types_w_uplift = ['ModelMetricsBinomialUplift']

        print()
        print(metric_type + ": " + self._algo)
        reported_on = "** Reported on {} data. **"
        if self._on_train:
            print(reported_on.format("train"))
        elif self._on_valid:
            print(reported_on.format("validation"))
        elif self._on_xval:
            print(reported_on.format("cross-validation"))
        else:
            print(reported_on.format("test"))
        print()

        if metric_type not in types_w_anomaly and metric_type not in types_w_uplift and \
                metric_type not in types_w_clustering:
            print("MSE: " + str(self.mse()))
            print("RMSE: " + str(self.rmse()))
        if metric_type in types_w_mean_absolute_error:
            print("MAE: " + str(self.mae()))
            print("RMSLE: " + str(self.rmsle()))
        if metric_type in types_w_r2:
            print("R^2: " + str(self.r2()))
        if metric_type in types_w_mean_residual_deviance:
            print("Mean Residual Deviance: " + str(self.mean_residual_deviance()))
        if metric_type in types_w_logloss:
            print("LogLoss: " + str(self.logloss()))
        if metric_type in types_w_mean_per_class_error:
            print("Mean Per-Class Error: %s" % self._mean_per_class_error())
        if metric_type in types_w_glm:
            if metric_type == 'ModelMetricsHGLMGaussianGaussian':  # print something for HGLM
                hglm_lines = (
                    ("Standard error of fixed columns: ", "sefe"),
                    ("Standard error of random columns: ", "sere"),
                    ("Coefficients for fixed columns: ", "fixedf"),
                    ("Coefficients for random columns: ", "ranef"),
                    ("Random column indices: ", "randc"),
                    ("Dispersion parameter of the mean model (residual variance for LMM): ", "varfix"),
                    ("Dispersion parameter of the random columns (variance of random columns): ", "varranef"),
                    ("Convergence reached for algorithm: ", "converge"),
                    ("Deviance degrees of freedom for mean part of the model: ", "dfrefe"),
                    ("Estimates and standard errors of the linear prediction in the dispersion model: ", "summvc1"),
                    ("Estimates and standard errors of the linear predictor for the dispersion parameter of the random columns: ", "summvc2"),
                    ("Index of most influential observation (-1 if none): ", "bad"),
                    ("H-likelihood: ", "hlik"),
                    ("Profile log-likelihood profiled over random columns: ", "pvh"),
                    ("Adjusted profile log-likelihood profiled over fixed and random effects: ", "pbvh"),
                    ("Conditional AIC: ", "caic"),
                )
                for label, metric_key in hglm_lines:
                    print(label + str(self.hglm_metric(metric_key)))
            else:
                print("Null degrees of freedom: " + str(self.null_degrees_of_freedom()))
                print("Residual degrees of freedom: " + str(self.residual_degrees_of_freedom()))
                print("Null deviance: " + str(self.null_deviance()))
                print("Residual deviance: " + str(self.residual_deviance()))
                print("AIC: " + str(self.aic()))
        if metric_type in types_w_bin:
            print("AUC: " + str(self.auc()))
            print("AUCPR: " + str(self.aucpr()))
            print("Gini: " + str(self.gini()))
            # Fetch each table once; the accessors are provided by the binomial subclass.
            conf_matrix = self.confusion_matrix()
            if conf_matrix:
                conf_matrix.show()
            max_scores = self._metric_json.get("max_criteria_and_metric_scores")
            if max_scores:
                max_scores.show()
            gains_lift = self.gains_lift()
            if gains_lift:
                print(gains_lift)
        if metric_type in types_w_mult:
            print("AUC: " + str(self.auc()))
            print("AUCPR: " + str(self.aucpr()))
            # AUC and PR AUC table cannot be computed due domain size
            auc_table = self._metric_json.get("multinomial_auc_table")
            if auc_table is not None:
                auc_table.show()
            else:
                print("Multinomial auc values: Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains).")
            aucpr_table = self._metric_json.get("multinomial_aucpr_table")
            if aucpr_table is not None:
                aucpr_table.show()
            else:
                print("Multinomial auc_pr values: Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains).")
        if metric_type in types_w_anomaly:
            print("Anomaly Score: " + str(self.mean_score()))
            print("Normalized Anomaly Score: " + str(self.mean_normalized_score()))
        if (metric_type in types_w_mult) or (metric_type in types_w_ord):
            self.confusion_matrix().show()
            self.hit_ratio_table().show()
        if metric_type in types_w_clustering:
            print("Total Within Cluster Sum of Square Error: " + str(self.tot_withinss()))
            print("Total Sum of Square Error to Grand Mean: " + str(self.totss()))
            print("Between Cluster Sum of Square Error: " + str(self.betweenss()))
            centroid_stats = self._metric_json.get('centroid_stats')
            if centroid_stats is not None:
                centroid_stats.show()
            else:
                print("Centroid stats are not available.")
        if metric_type in types_w_dim:
            print("Sum of Squared Error (Numeric): " + str(self.num_err()))
            print("Misclassification Error (Categorical): " + str(self.cat_err()))
        if metric_type in types_w_cox:
            print("Concordance score: " + str(self.concordance()))
            print("Concordant count: " + str(self.concordant()))
            print("Tied count: " + str(self.tied_y()))
        if metric_type in types_w_uplift:
            print("AUUC: " + str(self.auuc()))
        if self.custom_metric_name():
            print("{}: {}".format(self.custom_metric_name(), self.custom_metric_value()))

    def r2(self):
        """The R squared coefficient.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.r2()
        """
        return self._metric_json["r2"]

    def logloss(self):
        """Log loss.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.logloss()
        """
        return self._metric_json["logloss"]

    def nobs(self):
        """
        The number of observations.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> perf = cars_gbm.model_performance()
        >>> perf.nobs()
        """
        return self._metric_json["nobs"]

    def mean_residual_deviance(self):
        """The mean residual deviance for this set of metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")
        >>> air_gbm = H2OGradientBoostingEstimator()
        >>> air_gbm.train(x=list(range(9)),
        ...               y=9,
        ...               training_frame=airlines,
        ...               validation_frame=airlines)
        >>> air_gbm.mean_residual_deviance(train=True,valid=False,xval=False)
        """
        return self._metric_json["mean_residual_deviance"]

    def auc(self):
        """The AUC for this set of metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.auc()
        """
        return self._metric_json['AUC']

    def aucpr(self):
        """The area under the precision recall curve.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.aucpr()
        """
        return self._metric_json['pr_auc']

    @deprecated_fn(replaced_by=aucpr)
    def pr_auc(self):
        pass

    def aic(self):
        """The AIC for this set of metrics.

        :examples:

        >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate[2] = prostate[2].asfactor()
        >>> prostate[4] = prostate[4].asfactor()
        >>> prostate[5] = prostate[5].asfactor()
        >>> prostate[8] = prostate[8].asfactor()
        >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        >>> response = "CAPSULE"
        >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234)
        >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial")
        >>> pros_glm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> pros_glm.aic()
        """
        return self._metric_json['AIC']

    def gini(self):
        """Gini coefficient.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.gini()
        """
        return self._metric_json['Gini']

    def mse(self):
        """The MSE for this set of metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.mse()
        """
        return self._metric_json['MSE']

    def rmse(self):
        """The RMSE for this set of metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.rmse()
        """
        return self._metric_json['RMSE']

    def mae(self):
        """The MAE for this set of metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "cylinders"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(distribution = "poisson",
        ...                                         seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.mae()
        """
        return self._metric_json['mae']

    def rmsle(self):
        """The RMSLE for this set of metrics.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "cylinders"
        >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(distribution = "poisson",
        ...                                         seed = 1234)
        >>> cars_gbm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> cars_gbm.rmsle()
        """
        return self._metric_json['rmsle']

    def residual_deviance(self):
        """The residual deviance if the metrics contain it, otherwise None.

        :examples:

        >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate[2] = prostate[2].asfactor()
        >>> prostate[4] = prostate[4].asfactor()
        >>> prostate[5] = prostate[5].asfactor()
        >>> prostate[8] = prostate[8].asfactor()
        >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        >>> response = "CAPSULE"
        >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234)
        >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial")
        >>> pros_glm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> pros_glm.residual_deviance()
        """
        if MetricsBase._has(self._metric_json, "residual_deviance"):
            return self._metric_json["residual_deviance"]
        return None

    def hglm_metric(self, metric_string):
        """Return the metrics JSON entry named ``metric_string``, or None when it is absent or None.

        :param metric_string: key to look up in the metrics JSON (:meth:`show` uses keys such as ``"sefe"``
            or ``"caic"`` for HGLM metrics).
        """
        if MetricsBase._has(self._metric_json, metric_string):
            return self._metric_json[metric_string]
        return None

    def residual_degrees_of_freedom(self):
        """The residual degrees of freedom if the metrics contain them, otherwise None.

        :examples:

        >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate[2] = prostate[2].asfactor()
        >>> prostate[4] = prostate[4].asfactor()
        >>> prostate[5] = prostate[5].asfactor()
        >>> prostate[8] = prostate[8].asfactor()
        >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        >>> response = "CAPSULE"
        >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234)
        >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial")
        >>> pros_glm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> pros_glm.residual_degrees_of_freedom()
        """
        if MetricsBase._has(self._metric_json, "residual_degrees_of_freedom"):
            return self._metric_json["residual_degrees_of_freedom"]
        return None

    def null_deviance(self):
        """The null deviance if the metrics contain it, otherwise None.

        :examples:

        >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate[2] = prostate[2].asfactor()
        >>> prostate[4] = prostate[4].asfactor()
        >>> prostate[5] = prostate[5].asfactor()
        >>> prostate[8] = prostate[8].asfactor()
        >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        >>> response = "CAPSULE"
        >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234)
        >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial")
        >>> pros_glm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> pros_glm.null_deviance()
        """
        if MetricsBase._has(self._metric_json, "null_deviance"):
            return self._metric_json["null_deviance"]
        return None

    def null_degrees_of_freedom(self):
        """The null degrees of freedom if the metrics contain them, otherwise None.

        :examples:

        >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate[2] = prostate[2].asfactor()
        >>> prostate[4] = prostate[4].asfactor()
        >>> prostate[5] = prostate[5].asfactor()
        >>> prostate[8] = prostate[8].asfactor()
        >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        >>> response = "CAPSULE"
        >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234)
        >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial")
        >>> pros_glm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> pros_glm.null_degrees_of_freedom()
        """
        if MetricsBase._has(self._metric_json, "null_degrees_of_freedom"):
            return self._metric_json["null_degrees_of_freedom"]
        return None

    # private accessor for mean per-class error - the public version is overridden in H2OBinomialModelMetrics with
    # a method with different return semantics
    def _mean_per_class_error(self):
        return self._metric_json['mean_per_class_error']

    def mean_per_class_error(self):
        """The mean per class error.

        :examples:

        >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate[2] = prostate[2].asfactor()
        >>> prostate[4] = prostate[4].asfactor()
        >>> prostate[5] = prostate[5].asfactor()
        >>> prostate[8] = prostate[8].asfactor()
        >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        >>> response = "CAPSULE"
        >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234)
        >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial")
        >>> pros_glm.train(x = predictors,
        ...                y = response,
        ...                training_frame = train,
        ...                validation_frame = valid)
        >>> pros_glm.mean_per_class_error()
        """
        return self._mean_per_class_error()

    def custom_metric_name(self):
        """Name of custom metric or None."""
        if MetricsBase._has(self._metric_json, "custom_metric_name"):
            return self._metric_json['custom_metric_name']
        return None

    def custom_metric_value(self):
        """Value of custom metric or None."""
        if MetricsBase._has(self._metric_json, "custom_metric_value"):
            return self._metric_json['custom_metric_value']
        return None
class H2ORegressionModelMetrics(MetricsBase):
    """
    This class provides an API for inspecting the metrics returned by a regression model.

    It is possible to retrieve the R^2 (1 - MSE/variance) and MSE.

    :examples:

    >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "cylinders"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_glm = H2OGeneralizedLinearEstimator()
    >>> cars_glm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_glm.mse()
    """

    def __init__(self, metric_json, on=None, algo=""):
        # No regression-specific state: everything is handled by the base initializer.
        super(H2ORegressionModelMetrics, self).__init__(metric_json, on=on, algo=algo)
class H2OClusteringModelMetrics(MetricsBase):
    """Metrics wrapper exposing the sum-of-squares entries of a clustering metrics JSON."""

    def __init__(self, metric_json, on=None, algo=""):
        super(H2OClusteringModelMetrics, self).__init__(metric_json, on=on, algo=algo)

    def tot_withinss(self):
        """The Total Within Cluster Sum-of-Square Error, or None if not present.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.tot_withinss()
        """
        metrics = self._metric_json
        return metrics["tot_withinss"] if MetricsBase._has(metrics, "tot_withinss") else None

    def totss(self):
        """The Total Sum-of-Square Error to Grand Mean, or None if not present.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.totss()
        """
        metrics = self._metric_json
        return metrics["totss"] if MetricsBase._has(metrics, "totss") else None

    def betweenss(self):
        """The Between Cluster Sum-of-Square Error, or None if not present.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.betweenss()
        """
        metrics = self._metric_json
        return metrics["betweenss"] if MetricsBase._has(metrics, "betweenss") else None
class H2OMultinomialModelMetrics(MetricsBase):
    """Metrics wrapper exposing the confusion matrix, hit ratios and AUC tables of a multinomial metrics JSON."""

    def __init__(self, metric_json, on=None, algo=""):
        super(H2OMultinomialModelMetrics, self).__init__(metric_json, on, algo)

    def confusion_matrix(self):
        """Returns a confusion matrix based on H2O's default prediction threshold for a dataset.

        The value is read from the ``table`` entry of the ``cm`` section of the metrics JSON.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["cylinders"] = cars["cylinders"].asfactor()
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> response_col = "cylinders"
        >>> distribution = "multinomial"
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
        ...                                    distribution = distribution)
        >>> gbm.train(x=predictors,
        ...           y = response_col,
        ...           training_frame = train,
        ...           validation_frame = valid)
        >>> gbm.confusion_matrix(train)
        """
        return self._metric_json['cm']['table']

    def hit_ratio_table(self):
        """Retrieve the Hit Ratios.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["cylinders"] = cars["cylinders"].asfactor()
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> response_col = "cylinders"
        >>> distribution = "multinomial"
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
        ...                                    distribution = distribution)
        >>> gbm.train(x=predictors,
        ...           y = response_col,
        ...           training_frame = train,
        ...           validation_frame = valid)
        >>> gbm.hit_ratio_table()
        """
        return self._metric_json['hit_ratio_table']

    def multinomial_auc_table(self):
        """Retrieve the multinomial AUC values.

        :returns: the ``multinomial_auc_table`` entry of the metrics JSON, or an explanatory message
            (a string) when that entry is missing or None.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["cylinders"] = cars["cylinders"].asfactor()
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> response_col = "cylinders"
        >>> distribution = "multinomial"
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
        ...                                    distribution = distribution)
        >>> gbm.train(x=predictors,
        ...           y = response_col,
        ...           training_frame = train,
        ...           validation_frame = valid)
        >>> gbm.multinomial_auc_table()
        """
        # .get() so that a payload without the key yields the message rather than a KeyError.
        table = self._metric_json.get('multinomial_auc_table')
        if table is not None:
            return table
        return "Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains)."

    def multinomial_aucpr_table(self):
        """Retrieve the multinomial PR AUC values.

        :returns: the ``multinomial_aucpr_table`` entry of the metrics JSON, or an explanatory message
            (a string) when that entry is missing or None.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["cylinders"] = cars["cylinders"].asfactor()
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> response_col = "cylinders"
        >>> distribution = "multinomial"
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
        ...                                    distribution = distribution)
        >>> gbm.train(x=predictors,
        ...           y = response_col,
        ...           training_frame = train,
        ...           validation_frame = valid)
        >>> gbm.multinomial_aucpr_table()
        """
        # .get() so that a payload without the key yields the message rather than a KeyError.
        table = self._metric_json.get('multinomial_aucpr_table')
        if table is not None:
            return table
        return "Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains)."
class H2OOrdinalModelMetrics(MetricsBase):
    """Metrics wrapper exposing the confusion matrix and hit ratios of an ordinal metrics JSON."""

    def __init__(self, metric_json, on=None, algo=""):
        super(H2OOrdinalModelMetrics, self).__init__(metric_json, on=on, algo=algo)

    def confusion_matrix(self):
        """Return the confusion matrix stored under ``cm`` -> ``table`` in the metrics JSON.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["cylinders"] = cars["cylinders"].asfactor()
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> response_col = "cylinders"
        >>> distribution = "multinomial"
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
        ...                                    distribution = distribution)
        >>> gbm.train(x=predictors,
        ...           y = response_col,
        ...           training_frame = train,
        ...           validation_frame = valid)
        >>> gbm.confusion_matrix(train)
        """
        cm_section = self._metric_json['cm']
        return cm_section['table']

    def hit_ratio_table(self):
        """Return the ``hit_ratio_table`` entry of the metrics JSON.

        :examples:

        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["cylinders"] = cars["cylinders"].asfactor()
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> response_col = "cylinders"
        >>> distribution = "multinomial"
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
        ...                                    distribution = distribution)
        >>> gbm.train(x=predictors,
        ...           y = response_col,
        ...           training_frame = train,
        ...           validation_frame = valid)
        >>> gbm.hit_ratio_table()
        """
        hit_ratios = self._metric_json['hit_ratio_table']
        return hit_ratios
class H2OHGLMModelMetrics(MetricsBase):
    """Metrics wrapper for HGLM models; ``algo`` defaults to ``"HGLM Gaussian Gaussian"``."""

    def __init__(self, metric_json, on=None, algo="HGLM Gaussian Gaussian"):
        super(H2OHGLMModelMetrics, self).__init__(metric_json, on=on, algo=algo)
[docs]class H2OBinomialModelMetrics(MetricsBase):
"""
This class is essentially an API for the AUC object.
This class contains methods for inspecting the AUC for different criteria.
To input the different criteria, use the static variable `criteria`.
"""
def __init__(self, metric_json, on=None, algo=""):
    """
    Create a new Binomial Metrics object (essentially a wrapper around some json).

    :param metric_json: A blob of json holding all of the needed information.
    :param on: Which data the metrics were built on: "training_metrics", "validation_metrics",
        "cross_validation_metrics" or None (default).
    :param algo: The algorithm the metrics are based off of (e.g. deeplearning, gbm, etc.)
    :returns: A new H2OBinomialModelMetrics object.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.auc(valid=True)
    """
    super(H2OBinomialModelMetrics, self).__init__(metric_json=metric_json, on=on, algo=algo)
def F1(self, thresholds=None):
    """
    Retrieve the F1 score at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The F1 for the given set of thresholds (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.F1()
    """
    f1_scores = self.metric("f1", thresholds=thresholds)
    return f1_scores
def F2(self, thresholds=None):
    """
    Retrieve the F2 score at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The F2 for this set of metrics and thresholds (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.F2()
    """
    f2_scores = self.metric("f2", thresholds=thresholds)
    return f2_scores
def F0point5(self, thresholds=None):
    """
    Retrieve the F0.5 score at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The F0.5 for this set of metrics and thresholds (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.F0point5()
    """
    f0point5_scores = self.metric("f0point5", thresholds=thresholds)
    return f0point5_scores
def accuracy(self, thresholds=None):
    """
    Retrieve the accuracy at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The accuracy for this set of metrics and thresholds (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.accuracy()
    """
    accuracies = self.metric("accuracy", thresholds=thresholds)
    return accuracies
def error(self, thresholds=None):
    """
    Retrieve the error (1 - accuracy) at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold minimizing the error (i.e. maximizing the accuracy) will be used.
    :returns: The error for this set of metrics and thresholds.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.error()
    """
    accuracies = self.metric("accuracy", thresholds=thresholds)
    # Errors are derived from the accuracies rather than stored separately.
    return H2OBinomialModelMetrics._accuracy_to_error(accuracies)
def precision(self, thresholds=None):
    """
    Retrieve the precision at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The precision for this set of metrics and thresholds (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.precision()
    """
    precisions = self.metric("precision", thresholds=thresholds)
    return precisions
def tpr(self, thresholds=None):
    """
    Retrieve the True Positive Rate at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The True Positive Rate (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.tpr()
    """
    true_positive_rates = self.metric("tpr", thresholds=thresholds)
    return true_positive_rates
def tnr(self, thresholds=None):
    """
    Retrieve the True Negative Rate at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The True Negative Rate (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.tnr()
    """
    true_negative_rates = self.metric("tnr", thresholds=thresholds)
    return true_negative_rates
def fnr(self, thresholds=None):
    """
    Retrieve the False Negative Rate at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The False Negative Rate (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.fnr()
    """
    false_negative_rates = self.metric("fnr", thresholds=thresholds)
    return false_negative_rates
def fpr(self, thresholds=None):
    """
    Retrieve the False Positive Rate at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The False Positive Rate (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.fpr()
    """
    false_positive_rates = self.metric("fpr", thresholds=thresholds)
    return false_positive_rates
def recall(self, thresholds=None):
    """
    Retrieve the recall at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: Recall for this set of metrics and thresholds (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.recall()
    """
    recalls = self.metric("recall", thresholds=thresholds)
    return recalls
def sensitivity(self, thresholds=None):
    """
    Retrieve the sensitivity at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: Sensitivity or True Positive Rate for this set of metrics and thresholds
        (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.sensitivity()
    """
    sensitivities = self.metric("sensitivity", thresholds=thresholds)
    return sensitivities
def fallout(self, thresholds=None):
    """
    Retrieve the fallout at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The fallout (same as False Positive Rate) for this set of metrics and thresholds
        (see :meth:`metric` for the result shape).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.fallout()
    """
    fallouts = self.metric("fallout", thresholds=thresholds)
    return fallouts
def missrate(self, thresholds=None):
    """
    Retrieve the miss rate at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The miss rate (same as False Negative Rate); see :meth:`metric` for the result shape.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.missrate()
    """
    miss_rates = self.metric("missrate", thresholds=thresholds)
    return miss_rates
def specificity(self, thresholds=None):
    """
    Retrieve the specificity at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The specificity (same as True Negative Rate); see :meth:`metric` for the result shape.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.specificity()
    """
    specificities = self.metric("specificity", thresholds=thresholds)
    return specificities
def mcc(self, thresholds=None):
    """
    Retrieve the absolute MCC at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold maximizing the metric will be used.
    :returns: The absolute MCC (a value between 0 and 1, 0 being totally dissimilar, 1 being identical);
        see :meth:`metric` for the result shape.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.mcc()
    """
    # The stored metric is named "absolute_mcc"; this method is only a shorter alias for it.
    absolute_mccs = self.metric("absolute_mcc", thresholds=thresholds)
    return absolute_mccs
def max_per_class_error(self, thresholds=None):
    """
    Retrieve the maximum per-class error at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold minimizing the error will be used.
    :returns: Return 1 - min(per class accuracy).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.max_per_class_error()
    """
    min_accuracies = self.metric("min_per_class_accuracy", thresholds=thresholds)
    # The largest per-class error is the complement of the smallest per-class accuracy.
    return H2OBinomialModelMetrics._accuracy_to_error(min_accuracies)
def mean_per_class_error(self, thresholds=None):
    """
    Retrieve the mean per-class error at the requested threshold(s).

    :param thresholds: a number, a list of numbers (i.e. [0.01, 0.5, 0.99]) or 'all'.
        If None, then the threshold minimizing the error will be used.
    :returns: mean per class error (1 - mean per class accuracy).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.mean_per_class_error()
    """
    mean_accuracies = self.metric("mean_per_class_accuracy", thresholds=thresholds)
    return H2OBinomialModelMetrics._accuracy_to_error(mean_accuracies)
@staticmethod
def _accuracy_to_error(accuracies):
    """
    Convert a result of :meth:`metric` holding accuracies into the matching errors.

    :param accuracies: a list of ``[threshold, accuracy]`` pairs carrying a ``value`` attribute
        (either a single accuracy or a list of accuracies).
    :returns: a ``List`` of ``[threshold, 1 - accuracy]`` pairs whose ``value`` attribute is the
        complement of the input's ``value`` (element-wise when it is a list).
    """
    errors = List()
    for pair in accuracies:
        errors.append([pair[0], 1 - pair[1]])
    # `value` mirrors the shape of the input: a list stays a list, a scalar stays a scalar.
    if isinstance(accuracies.value, list):
        errors.value = [1 - v for v in accuracies.value]
    else:
        errors.value = 1 - accuracies.value
    return errors
def metric(self, metric, thresholds=None):
    """
    Look up the values of one metric at the requested threshold(s).

    :param str metric: A metric among :const:`maximizing_metrics`.
    :param thresholds: thresholds parameter must be a number or a list (i.e. [0.01, 0.5, 0.99]).

        - If None, then the threshold maximizing the metric will be used.
        - If 'all', then all stored thresholds are used and returned with the matching metric.

    :returns: The set of metrics for the list of thresholds, as a list of ``[threshold, value]`` pairs.
        The returned list has a 'value' property holding only
        the metric value (if no threshold provided or if provided as a number),
        or all the metric values (if thresholds provided as a list or as 'all').
    :raises ValueError: if ``metric`` is not one of :const:`maximizing_metrics`.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> local_data = [[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],
    ...               [1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],
    ...               [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],
    ...               [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b']]
    >>> h2o_data = h2o.H2OFrame(local_data)
    >>> h2o_data.set_names(['response', 'predictor'])
    >>> h2o_data["response"] = h2o_data["response"].asfactor()
    >>> gbm = H2OGradientBoostingEstimator(ntrees=1,
    ...                                    distribution="bernoulli")
    >>> gbm.train(x=list(range(1,h2o_data.ncol)),
    ...           y="response",
    ...           training_frame=h2o_data)
    >>> perf = gbm.model_performance()
    >>> perf.metric("tps", [perf.find_threshold_by_max_metric("f1")])[0][1]
    """
    assert_is_type(thresholds, None, 'all', numeric, [numeric])
    allowed = H2OBinomialModelMetrics.maximizing_metrics
    if metric not in allowed:
        raise ValueError("The only allowable metrics are {}".format(', '.join(allowed)))
    # Aliases (e.g. 'recall') are translated to the column name actually stored in the table.
    h2o_metric = H2OBinomialModelMetrics.metrics_aliases.get(metric, metric)
    # Decided before `thresholds` is normalised below, since that normalisation loses the distinction.
    value_is_scalar = is_type(metric, str) and (thresholds is None or is_type(thresholds, numeric))

    if thresholds is None:
        thresholds = [self.find_threshold_by_max_metric(h2o_metric)]
    elif thresholds == 'all':
        thresholds = None  # from here on, None means "every stored threshold"
    elif is_type(thresholds, numeric):
        thresholds = [thresholds]

    pairs = List()
    table = self._metric_json['thresholds_and_metric_scores']
    if thresholds is None:
        # fast path to return all thresholds: skipping find_idx logic
        for threshold, score in zip(table['threshold'], table[h2o_metric]):
            pairs.append([threshold, score])
    else:
        for wanted in thresholds:
            row_idx = self.find_idx_by_threshold(wanted)
            pairs.append([wanted, table[h2o_metric][row_idx]])

    if value_is_scalar:
        pairs.value = pairs[0][1]
    else:
        pairs.value = [pair[1] for pair in pairs]
    return pairs
@deprecated_params({'save_to_file': 'save_plot_path'})
def plot(self, type="roc", server=False, save_plot_path=None, plot=True, **kwargs):
    """
    Produce the desired metric plot.

    :param type: the type of metric plot (currently, only ROC curve ('roc') and Precision Recall curve ('pr')
        are supported).
    :param server: if True, generate plot inline using matplotlib's "Agg" backend.
    :param save_plot_path: filename to save the plot to
    :param plot: True to plot curve, False to get a tuple of values at axis x and y of the plot
        (fprs and tprs for ROC, recall and precision for PR)
    :returns: None or values of x and y axis of the plot + the resulting plot (can be accessed using
        result.figure())
    :raises ValueError: if ``type`` is neither 'roc' nor 'pr'.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.plot(type="roc")
    >>> cars_gbm.plot(type="pr")
    """
    if type == "roc":
        return self._plot_roc(server, save_plot_path, plot)
    if type == "pr":
        return self._plot_pr(server, save_plot_path, plot)
    # An unknown plot type used to fall through and return None without any hint; fail loudly instead.
    raise ValueError("type expected to be 'roc' or 'pr'. Got: " + str(type))
def _plot_roc(self, server=False, save_to_file=None, plot=True):
    """
    Build the ROC curve data and optionally draw it with matplotlib.

    :param server: forwarded to ``get_matplotlib_pyplot``; when False the figure is also shown.
    :param save_to_file: if not None, the drawn figure is saved under this file name.
    :param plot: if False, nothing is drawn and only the curve values are returned.
    :returns: the result of ``decorate_plot_result`` wrapping ``(fprs, tprs)`` and, when drawn, the figure.
    """
    if not plot:
        return decorate_plot_result(res=(self.fprs, self.tprs))

    plt = get_matplotlib_pyplot(server)
    if plt is None:
        # pyplot could not be obtained: return a result whose figure is RAISE_ON_FIGURE_ACCESS.
        return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS)

    fig = plt.figure()
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic Curve')
    auc_label = r'AUC={0:.4f}'.format(self._metric_json["AUC"])
    plt.text(0.5, 0.5, auc_label)
    plt.plot(self.fprs, self.tprs, 'b--')
    plt.axis([0, 1, 0, 1])
    plt.grid(True)
    plt.tight_layout()
    if not server:
        plt.show()
    if save_to_file is not None:  # only save when a figure is actually plotted
        fig.savefig(fname=save_to_file)
    return decorate_plot_result(res=(self.fprs, self.tprs), figure=fig)
def _plot_pr(self, server=False, save_to_file=None, plot=True):
    """
    Build the Precision-Recall curve data and optionally draw it with matplotlib.

    Recall is the true positive rate, so it is read from :attr:`tprs`; precision is read from the
    ``precision`` column of the ``thresholds_and_metric_scores`` table. Both are listed per stored
    threshold, in table order.

    :param server: forwarded to ``get_matplotlib_pyplot``; when False the figure is also shown.
    :param save_to_file: if not None, the drawn figure is saved under this file name.
    :param plot: if False, nothing is drawn and only the curve values are returned.
    :returns: the result of ``decorate_plot_result`` wrapping ``(recalls, precisions)`` and, when drawn,
        the figure.
    """
    # x axis: recall (== TPR). y axis: precision. Taking element 0 of the pairs returned by
    # `self.recall(thresholds='all')` would yield the thresholds, not the recall values.
    recalls = self.tprs
    precisions = self._metric_json["thresholds_and_metric_scores"]["precision"]
    assert len(precisions) == len(recalls), "Precision and recall arrays must have the same length"
    if not plot:
        return decorate_plot_result(res=(recalls, precisions))

    plt = get_matplotlib_pyplot(server)
    if plt is None:
        return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS)

    fig = plt.figure()
    plt.xlabel('Recall (TP/(TP+FN))')
    plt.ylabel('Precision (TP/(TP+FP))')
    plt.title('Precision Recall Curve')
    plt.text(0.75, 0.95, r'auc_pr={0:.4f}'.format(self._metric_json["pr_auc"]))
    plt.plot(recalls, precisions, 'b--')
    plt.axis([0, 1, 0, 1])
    plt.grid(True)
    plt.tight_layout()
    if not server:
        plt.show()
    if save_to_file is not None:  # only save when a figure is actually plotted
        # Save through the figure handle (as _plot_roc does) so the right figure is written
        # even if pyplot's "current figure" changed after show().
        fig.savefig(fname=save_to_file)
    return decorate_plot_result(res=(recalls, precisions), figure=fig)
@property
def fprs(self):
    """
    Return all false positive rates for all threshold values.

    :returns: a list of false positive rates (the ``fpr`` column of the thresholds table).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> r = cars[0].runif()
    >>> train = cars[r > .2]
    >>> valid = cars[r <= .2]
    >>> response_col = "economy_20mpg"
    >>> distribution = "bernoulli"
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random")
    >>> gbm.train(y=response_col, x=predictors, validation_frame=valid, training_frame=train)
    >>> (fprs, tprs) = gbm.roc(train=True, valid=False, xval=False)
    >>> fprs
    """
    scores_table = self._metric_json["thresholds_and_metric_scores"]
    return scores_table["fpr"]
@property
def tprs(self):
    """
    Return all true positive rates for all threshold values.

    :returns: a list of true positive rates (the ``tpr`` column of the thresholds table).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> r = cars[0].runif()
    >>> train = cars[r > .2]
    >>> valid = cars[r <= .2]
    >>> response_col = "economy_20mpg"
    >>> distribution = "bernoulli"
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random")
    >>> gbm.train(y=response_col, x=predictors, validation_frame=valid, training_frame=train)
    >>> (fprs, tprs) = gbm.roc(train=True, valid=False, xval=False)
    >>> tprs
    """
    scores_table = self._metric_json["thresholds_and_metric_scores"]
    return scores_table["tpr"]
def roc(self):
    """
    Return the coordinates of the ROC curve.

    :returns: The ROC values, as a tuple ``(fprs, tprs)`` holding the false positive rates as a list
        and the true positive rates as a list.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> r = cars[0].runif()
    >>> train = cars[r > .2]
    >>> valid = cars[r <= .2]
    >>> response_col = "economy_20mpg"
    >>> distribution = "bernoulli"
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> gbm = H2OGradientBoostingEstimator(nfolds=3,
    ...                                    distribution=distribution,
    ...                                    fold_assignment="Random")
    >>> gbm.train(x=predictors,
    ...           y=response_col,
    ...           validation_frame=valid,
    ...           training_frame=train)
    >>> gbm.roc(train=True, valid=False, xval=False)
    """
    false_positive_rates = self.fprs
    true_positive_rates = self.tprs
    return false_positive_rates, true_positive_rates
#: alternative metric names, mapped to the name under which the metric is actually stored
metrics_aliases = {
    'fallout': 'fpr',
    'missrate': 'fnr',
    'recall': 'tpr',
    'sensitivity': 'tpr',
    'specificity': 'tnr',
}

#: metrics names allowed for confusion matrix
maximizing_metrics = (
    'absolute_mcc', 'accuracy', 'precision',
    'f0point5', 'f1', 'f2',
    'mean_per_class_accuracy', 'min_per_class_accuracy',
    'tns', 'fns', 'fps', 'tps',
    'tnr', 'fnr', 'fpr', 'tpr',
) + tuple(metrics_aliases)
def confusion_matrix(self, metrics=None, thresholds=None):
    """
    Get the confusion matrix for the specified metrics and/or thresholds.

    One matrix is produced per explicit threshold, followed by one per metric (computed at the
    threshold maximizing that metric). The ``metrics`` and ``thresholds`` arguments are not modified.

    :param metrics: A string (or list of strings) among metrics listed in :const:`maximizing_metrics`.
        Defaults to 'f1' when neither ``metrics`` nor ``thresholds`` is given.
    :param thresholds: A value (or list of values) between 0 and 1.
        If None, then the thresholds maximizing each provided metric will be used.
    :returns: a list of ConfusionMatrix objects (if there are more than one to return), a single
        ConfusionMatrix (if there is only one) or None if thresholds and metric scores are missing.
    :raises ValueError: if one of the metrics is not among :const:`maximizing_metrics`.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> perf = cars_gbm.model_performance()
    >>> perf.confusion_matrix()
    >>> perf.confusion_matrix(metrics=["f1", "accuracy"], thresholds=0.5)
    """
    thresh2d = self._metric_json['thresholds_and_metric_scores']
    if thresh2d is None:
        return None

    # make lists out of metrics and thresholds arguments
    if metrics is None and thresholds is None:
        metrics = ['f1']

    if isinstance(metrics, list):
        metrics_list = metrics
    elif metrics is None:
        metrics_list = []
    else:
        metrics_list = [metrics]

    if isinstance(thresholds, list):
        # Work on a copy: the metric-derived thresholds are appended below and must not
        # leak into the list owned by the caller.
        thresholds_list = list(thresholds)
    elif thresholds is None:
        thresholds_list = []
    else:
        thresholds_list = [thresholds]

    # error check the metrics_list and thresholds_list
    assert_is_type(thresholds_list, [numeric])
    assert_satisfies(thresholds_list, all(0 <= t <= 1 for t in thresholds_list))

    if not all(m.lower() in H2OBinomialModelMetrics.maximizing_metrics for m in metrics_list):
        raise ValueError("The only allowable metrics are {}".format(', '.join(H2OBinomialModelMetrics.maximizing_metrics)))

    # make one big list that combines the thresholds and metric-thresholds:
    # entries [0, first_metric_pos) come from `thresholds`, the remainder from `metrics_list`.
    first_metric_pos = len(thresholds_list)
    thresholds_list.extend(self.find_threshold_by_max_metric(m) for m in metrics_list)

    cms = []
    for i, t in enumerate(thresholds_list):
        idx = self.find_idx_by_threshold(t)
        row = thresh2d.cell_values[idx]
        actual_threshold = float(row[0])
        # Columns 11..14 of a thresholds-table row are read as the tns, fns, fps and tps counts.
        tns = row[11]
        fns = row[12]
        fps = row[13]
        tps = row[14]
        p = tps + fns
        n = tns + fps
        c0 = n - fps
        c1 = p - tps
        if i >= first_metric_pos:
            # Decide by position, not by value: an explicit threshold that happens to equal a
            # metric-maximizing threshold must not be labelled with a metric name.
            m = metrics_list[i - first_metric_pos]
            table_header = "Confusion Matrix (Act/Pred) for max {} @ threshold = {}".format(m, actual_threshold)
        else:
            table_header = "Confusion Matrix (Act/Pred) @ threshold = {}".format(actual_threshold)
        cms.append(ConfusionMatrix(cm=[[c0, fps], [c1, tps]], domains=self._metric_json['domain'],
                                   table_header=table_header))

    if len(cms) == 1:
        return cms[0]
    return cms
def find_threshold_by_max_metric(self, metric):
    """
    Find the threshold at which the given metric is maximal.

    :param metric: A string among the metrics listed in :const:`maximizing_metrics`.
    :returns: the threshold at which the given metric is maximal.
    :raises ValueError: if the metric has no entry in the ``max_criteria_and_metric_scores`` table.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> local_data = [[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],
    ...               [1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],
    ...               [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],
    ...               [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b']]
    >>> h2o_data = h2o.H2OFrame(local_data)
    >>> h2o_data.set_names(['response', 'predictor'])
    >>> h2o_data["response"] = h2o_data["response"].asfactor()
    >>> gbm = H2OGradientBoostingEstimator(ntrees=1,
    ...                                    distribution="bernoulli")
    >>> gbm.train(x=list(range(1,h2o_data.ncol)),
    ...           y="response",
    ...           training_frame=h2o_data)
    >>> perf = gbm.model_performance()
    >>> perf.find_threshold_by_max_metric("f1")
    """
    max_criteria = self._metric_json['max_criteria_and_metric_scores']
    # Aliases (e.g. 'recall') are translated to the name used in the criteria table.
    h2o_metric = H2OBinomialModelMetrics.metrics_aliases.get(metric, metric)
    for row in max_criteria.cell_values:
        # Each row starts with a label of the form "max <metric>", followed by the threshold.
        if row[0] == "max " + h2o_metric.lower():
            return row[1]
    raise ValueError("No metric " + str(metric.lower()))
def find_idx_by_threshold(self, threshold):
    """
    Retrieve the index in this metric's threshold list at which the given threshold is located.

    A stored threshold matches when it is within a relative tolerance of 1e-8 of the requested one.
    If none matches and the requested threshold lies in [0, 1], the index of the closest stored
    threshold is returned and a notice is printed.

    :param threshold: Find the index of this input threshold.
    :returns: the index
    :raises ValueError: if no such index can be found (threshold outside [0, 1], or no thresholds stored).

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> local_data = [[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],
    ...               [1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],
    ...               [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],
    ...               [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b']]
    >>> h2o_data = h2o.H2OFrame(local_data)
    >>> h2o_data.set_names(['response', 'predictor'])
    >>> h2o_data["response"] = h2o_data["response"].asfactor()
    >>> gbm = H2OGradientBoostingEstimator(ntrees=1,
    ...                                    distribution="bernoulli")
    >>> gbm.train(x=list(range(1,h2o_data.ncol)),
    ...           y="response",
    ...           training_frame=h2o_data)
    >>> perf = gbm.model_performance()
    >>> perf.find_idx_by_threshold(0.45)
    """
    assert_is_type(threshold, numeric)
    thresh2d = self._metric_json['thresholds_and_metric_scores']
    thresholds = [float(e[0]) for e in thresh2d.cell_values]
    for i, t in enumerate(thresholds):
        # `<=` (not `<`) so that an exact match at 0 is found: there the tolerance itself is 0.
        if abs(t - threshold) <= 1e-8 * max(abs(t), abs(threshold)):
            return i
    if 0 <= threshold <= 1:
        if not thresholds:
            raise ValueError("No thresholds available to match threshold {0}".format(threshold))
        threshold_diffs = [abs(t - threshold) for t in thresholds]
        closest_idx = threshold_diffs.index(min(threshold_diffs))
        closest_threshold = thresholds[closest_idx]
        print("Could not find exact threshold {0}; using closest threshold found {1}."
              .format(threshold, closest_threshold))
        return closest_idx
    raise ValueError("Threshold must be between 0 and 1, but got {0} ".format(threshold))
def gains_lift(self):
    """
    Retrieve the Gains/Lift table.

    :returns: the ``gains_lift_table`` entry of the metrics json, or None if there is no such entry.

    :examples:

    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    >>> predictors = ["displacement","power","weight","acceleration","year"]
    >>> response = "economy_20mpg"
    >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234)
    >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234)
    >>> cars_gbm.train(x = predictors,
    ...                y = response,
    ...                training_frame = train,
    ...                validation_frame = valid)
    >>> cars_gbm.gains_lift()
    """
    return self._metric_json.get('gains_lift_table')
class H2OBinomialUpliftModelMetrics(MetricsBase):
    """
    Metrics for binomial uplift models.

    This class is available only for Uplift DRF model.
    This class is essentially an API for the AUUC (area under uplift curve) object.
    """

    def __init__(self, metric_json, on=None, algo=""):
        """
        Create a new Binomial Uplift Metrics object (essentially a wrapper around some json).

        :param metric_json: A blob of json holding all of the needed information.
        :param on: Which metrics the json represents: "training_metrics", "validation_metrics",
            "cross_validation_metrics" or None (default). Any other value makes the base class
            raise a ValueError.
        :param algo: Name of the algorithm; passed through to the base class unchanged.
        """
        super(H2OBinomialUpliftModelMetrics, self).__init__(metric_json, on, algo)

    @staticmethod
    def _resolve_uplift_metric(metric):
        """Map the "AUTO" alias (upper or lower case) to "qini"; return any other value unchanged."""
        if metric in ("AUTO", "auto"):
            return 'qini'
        return metric

    def auuc(self, metric=None):
        """
        Retrieve area under uplift curve (AUUC) value.

        :param metric: AUUC metric type (None, "qini", "lift", "gain";
            default is None which means it takes default metric from model parameters).
        :returns: AUUC value.
        :raises AssertionError: if ``metric`` is neither None nor one of the supported types.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.auuc()
        """
        if metric is None:
            return self._metric_json['AUUC']
        # format() instead of "+" so that a non-string metric still produces the intended
        # AssertionError rather than a TypeError while building the message.
        assert metric in ['qini', 'lift', 'gain'], \
            "AUUC metric {0} should be 'qini','lift' or 'gain'.".format(metric)
        return self._metric_json['auuc_table'][metric][0]

    def uplift(self, metric="AUTO"):
        """
        Retrieve uplift values for each bin.

        :param metric: AUUC metric type ("qini", "lift", "gain"; default is "AUTO" which means "qini").
        :returns: a list of uplift values.
        :raises AssertionError: if ``metric`` is not one of the supported types.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.uplift()
        """
        metric = self._resolve_uplift_metric(metric)
        assert metric in ['qini', 'lift', 'gain'], \
            "Uplift metric {0} should be 'AUTO', 'qini', 'lift' or 'gain'.".format(metric)
        return self._metric_json["thresholds_and_metric_scores"][metric]

    def n(self):
        """
        Retrieve numbers of observations in each bin.

        :returns: a list of numbers of observation.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.n()
        """
        return self._metric_json["thresholds_and_metric_scores"]["n"]

    def thresholds(self):
        """
        Retrieve prediction thresholds for each bin.

        :returns: a list of thresholds.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.thresholds()
        """
        return self._metric_json["thresholds_and_metric_scores"]["thresholds"]

    def thresholds_and_metric_scores(self):
        """
        Retrieve thresholds and metric scores table.

        :returns: the thresholds and metric scores table stored in the metrics json.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.thresholds_and_metric_scores()
        """
        return self._metric_json["thresholds_and_metric_scores"]

    def auuc_table(self):
        """
        Retrieve all types of AUUC in a table.

        :returns: a table of AUUCs.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.auuc_table()
        """
        return self._metric_json["auuc_table"]

    def plot_uplift(self, server=False, save_to_file=None, plot=True, metric="auto"):
        """
        Plot Uplift Curve.

        :param server: if True, generate plot inline using matplotlib's "Agg" backend
            (the figure is not shown interactively).
        :param save_to_file: filename to save the plot to.
        :param plot: True to plot curve, False to get a tuple of values at axis x and y of the plot
            (number of observations and uplift values).
        :param metric: AUUC metric type ("qini", "lift", "gain"; default is "auto" which means
            "qini"; "AUTO" is accepted as well).
        :returns: None when ``plot`` is True; otherwise a tuple ``(n, uplift)``.
        :raises AssertionError: if ``metric`` is not one of the supported types.

        :examples:

        >>> from h2o.estimators import H2OUpliftRandomForestEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
        >>> treatment_column = "treatment"
        >>> response_column = "conversion"
        >>> train[treatment_column] = train[treatment_column].asfactor()
        >>> train[response_column] = train[response_column].asfactor()
        >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"]
        >>>
        >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10,
        ...                                               max_depth=5,
        ...                                               treatment_column=treatment_column,
        ...                                               uplift_metric="qini",
        ...                                               distribution="bernoulli",
        ...                                               gainslift_bins=10,
        ...                                               min_rows=10,
        ...                                               auuc_type="gain")
        >>> uplift_model.train(y=response_column, x=predictors, training_frame=train)
        >>> uplift_model.plot_uplift(plot=True)
        >>> n, uplift = uplift_model.plot_uplift(plot=False)
        """
        # The default "auto" is not a key of the metrics tables; resolve it once so that
        # auuc() and uplift() both receive a concrete metric name.
        metric = self._resolve_uplift_metric(metric)
        if not plot:
            return self.n(), self.uplift(metric)

        plt = get_matplotlib_pyplot(server)
        if plt is None:
            return

        # Fetch (and thereby validate) all data before touching the figure, so an unsupported
        # metric fails without leaving a half-decorated plot behind.
        auuc_value = self.auuc(metric)
        uplift = self.uplift(metric)
        n = self.n()

        plt.ylabel('Cumulative '+metric)
        plt.xlabel('Number Targeted')
        plt.title('Cumulate Uplift Curve - '+metric+"\n"+r'AUUC={0:.4f}'.format(auuc_value))
        plt.plot(n, uplift, 'b-', label='uplift')
        # Reference line: straight line from the origin to the last point of the uplift curve.
        slope = uplift[len(uplift)-1]/n[len(n)-1]
        rnd = [slope * nn for nn in n]
        plt.plot(n, rnd, 'k--', label='random')
        if metric == "lift":
            plt.legend(loc='upper right')
        else:
            plt.legend(loc='lower right')
        plt.grid(True)
        plt.tight_layout()
        # Save before show(): once an interactive window has been shown and closed, the
        # current figure may be empty and savefig() would write a blank image.
        if save_to_file is not None:
            plt.savefig(save_to_file)
        if not server:
            plt.show()
class H2OAutoEncoderModelMetrics(MetricsBase):
    """
    Metrics object for autoencoder models.

    :examples:

    >>> from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
    >>> train_ecg = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_train.csv")
    >>> test_ecg = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv")
    >>> anomaly_model = H2OAutoEncoderEstimator(activation="Tanh",
    ...                                         hidden=[50,50,50],
    ...                                         sparse=True, l1=1e-4,
    ...                                         epochs=100)
    >>> anomaly_model.train(x=train_ecg.names, training_frame=train_ecg)
    >>> anomaly_model.mse()
    """

    def __init__(self, metric_json, on=None, algo=""):
        """Initialise the metrics; all arguments are forwarded unchanged to ``MetricsBase``."""
        super(H2OAutoEncoderModelMetrics, self).__init__(metric_json, on=on, algo=algo)
class H2ODimReductionModelMetrics(MetricsBase):
    """Metrics object for dimensionality-reduction models."""

    def __init__(self, metric_json, on=None, algo=""):
        """Initialise the metrics; all arguments are forwarded unchanged to ``MetricsBase``."""
        super(H2ODimReductionModelMetrics, self).__init__(metric_json, on=on, algo=algo)

    def num_err(self):
        """
        Sum of Squared Error over non-missing numeric entries.

        :returns: the ``numerr`` value of the metrics json, or None if not present.
        """
        # A missing key and a stored None both yield None.
        return self._metric_json.get("numerr")

    def cat_err(self):
        """
        The Number of Misclassified categories over non-missing categorical entries.

        :returns: the ``caterr`` value of the metrics json, or None if not present.
        """
        # A missing key and a stored None both yield None.
        return self._metric_json.get("caterr")
class H2OWordEmbeddingModelMetrics(MetricsBase):
    """Metrics object for word-embedding models; adds nothing on top of ``MetricsBase``."""

    def __init__(self, metric_json, on=None, algo=""):
        """Initialise the metrics; all arguments are forwarded unchanged to ``MetricsBase``."""
        super(H2OWordEmbeddingModelMetrics, self).__init__(metric_json, on=on, algo=algo)
class H2OAnomalyDetectionModelMetrics(MetricsBase):
    """Metrics object for anomaly-detection models."""

    def __init__(self, metric_json, on=None, algo=""):
        """Initialise the metrics; all arguments are forwarded unchanged to ``MetricsBase``."""
        super(H2OAnomalyDetectionModelMetrics, self).__init__(metric_json, on=on, algo=algo)

    def mean_score(self):
        """
        Mean Anomaly Score. For Isolation Forest represents the average of all tree-path lengths.

        :returns: the ``mean_score`` value of the metrics json, or None if not present.

        :examples:

        >>> from h2o.estimators.isolation_forest import H2OIsolationForestEstimator
        >>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_train.csv")
        >>> test = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_test.csv")
        >>> isofor_model = H2OIsolationForestEstimator(sample_size=5, ntrees=7)
        >>> isofor_model.train(training_frame = train)
        >>> perf = isofor_model.model_performance()
        >>> perf.mean_score()
        """
        # A missing key and a stored None both yield None.
        return self._metric_json.get("mean_score")

    def mean_normalized_score(self):
        """
        Mean Normalized Anomaly Score. For Isolation Forest - normalized average path length.

        :returns: the ``mean_normalized_score`` value of the metrics json, or None if not present.

        :examples:

        >>> from h2o.estimators.isolation_forest import H2OIsolationForestEstimator
        >>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_train.csv")
        >>> test = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_test.csv")
        >>> isofor_model = H2OIsolationForestEstimator(sample_size=5, ntrees=7)
        >>> isofor_model.train(training_frame = train)
        >>> perf = isofor_model.model_performance()
        >>> perf.mean_normalized_score()
        """
        # A missing key and a stored None both yield None.
        return self._metric_json.get("mean_normalized_score")
class H2OModelMetricsRegressionCoxPH(MetricsBase):
    """
    Metrics object for CoxPH models.

    :examples:

    >>> from h2o.estimators.coxph import H2OCoxProportionalHazardsEstimator
    >>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
    >>> coxph = H2OCoxProportionalHazardsEstimator(start_column="start",
    ...                                            stop_column="stop",
    ...                                            ties="breslow")
    >>> coxph.train(x="age", y="event", training_frame=heart)
    >>> coxph
    """

    def __init__(self, metric_json, on=None, algo=""):
        """Initialise the metrics; all arguments are forwarded unchanged to ``MetricsBase``."""
        super(H2OModelMetricsRegressionCoxPH, self).__init__(metric_json, on=on, algo=algo)

    def concordance(self):
        """
        Concordance metrics (c-index).

        Proportion of concordant pairs divided by the total number of possible evaluation pairs.
        1.0 for perfect match, 0.5 for random results.

        :returns: the ``concordance`` value of the metrics json, or None if not present.
        """
        # A missing key and a stored None both yield None.
        return self._metric_json.get("concordance")

    def concordant(self):
        """
        Count of concordant pairs.

        :returns: the ``concordant`` value of the metrics json, or None if not present.
        """
        return self._metric_json.get("concordant")

    def tied_y(self):
        """
        Count of tied pairs.

        :returns: the ``tied_y`` value of the metrics json, or None if not present.
        """
        return self._metric_json.get("tied_y")
class H2OTargetEncoderMetrics(MetricsBase):
    """Metrics object for target-encoder models."""

    def __init__(self, metric_json, on=None, algo=""):
        """Initialise the metrics; all arguments are forwarded unchanged to ``MetricsBase``."""
        super(H2OTargetEncoderMetrics, self).__init__(metric_json, on=on, algo=algo)