Source code for h2o.model.metrics_base

# -*- encoding: utf-8 -*-
"""
Regression model.

:copyright: (c) 2016 H2O.ai
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
from __future__ import absolute_import, division, print_function, unicode_literals

from h2o.model.confusion_matrix import ConfusionMatrix
from h2o.plot import decorate_plot_result, get_matplotlib_pyplot, RAISE_ON_FIGURE_ACCESS
from h2o.utils.compatibility import *  # NOQA
from h2o.utils.metaclass import backwards_compatibility, deprecated_fn, h2o_meta, deprecated_params
from h2o.utils.typechecks import assert_is_type, assert_satisfies, is_type, numeric


[docs]@backwards_compatibility( instance_attrs=dict( giniCoef=lambda self, *args, **kwargs: self.gini(*args, **kwargs) ) ) class MetricsBase(h2o_meta()): """ A parent class to house common metrics available for the various Metrics types. The methods here are available across different model categories. """ def __init__(self, metric_json, on=None, algo=""): # Yep, it's messed up... if isinstance(metric_json, MetricsBase): metric_json = metric_json._metric_json self._metric_json = metric_json # train and valid and xval are not mutually exclusive -- could have a test. train and # valid only make sense at model build time. self._on_train = False self._on_valid = False self._on_xval = False self._algo = algo if on == "training_metrics": self._on_train = True elif on == "validation_metrics": self._on_valid = True elif on == "cross_validation_metrics": self._on_xval = True elif on is None: pass else: raise ValueError("on expected to be train,valid,or xval. Got: " + str(on))
[docs] @classmethod def make(cls, kvs): """Factory method to instantiate a MetricsBase object from the list of key-value pairs.""" return cls(metric_json=dict(kvs))
def __repr__(self): # FIXME !!! __repr__ should never print anything, but return a string self.show() return "" # TODO: convert to actual fields list def __getitem__(self, key): return self._metric_json.get(key) @staticmethod def _has(dictionary, key): return key in dictionary and dictionary[key] is not None
[docs] def show(self): """Display a short summary of the metrics. :examples: >>> from from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.show() """ if self._metric_json==None: print("WARNING: Model metrics cannot be calculated and metric_json is empty due to the absence of the response column in your dataset.") return metric_type = self._metric_json['__meta']['schema_type'] types_w_glm = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric', 'ModelMetricsBinomialGLM', 'ModelMetricsBinomialGLMGeneric', 'ModelMetricsHGLMGaussianGaussian', 'ModelMetricsHGLMGaussianGaussianGeneric'] types_w_clustering = ['ModelMetricsClustering'] types_w_mult = ['ModelMetricsMultinomial', 'ModelMetricsMultinomialGeneric'] types_w_ord = ['ModelMetricsOrdinal', 'ModelMetricsOrdinalGeneric'] types_w_bin = ['ModelMetricsBinomial', 'ModelMetricsBinomialGeneric', 'ModelMetricsBinomialGLM', 'ModelMetricsBinomialGLMGeneric'] types_w_r2 = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric'] types_w_mean_residual_deviance = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric', 'ModelMetricsRegression', 'ModelMetricsRegressionGeneric'] types_w_mean_absolute_error = ['ModelMetricsRegressionGLM', 'ModelMetricsRegressionGLMGeneric', 'ModelMetricsRegression', 'ModelMetricsRegressionGeneric'] types_w_mean_per_class_error = ['ModelMetricsBinomial', 'ModelMetricsBinomialGeneric', 'ModelMetricsOrdinal', 'ModelMetricsOrdinalGeneric'] + types_w_mult types_w_logloss = types_w_bin + types_w_mult+types_w_ord types_w_dim = ["ModelMetricsGLRM"] types_w_anomaly = ['ModelMetricsAnomaly'] types_w_cox = ['ModelMetricsRegressionCoxPH'] types_w_uplift = ['ModelMetricsBinomialUplift'] print() print(metric_type + ": " + self._algo) reported_on = "** Reported on {} data. **" if self._on_train: print(reported_on.format("train")) elif self._on_valid: print(reported_on.format("validation")) elif self._on_xval: print(reported_on.format("cross-validation")) else: print(reported_on.format("test")) print() if metric_type not in types_w_anomaly and metric_type not in types_w_uplift and \ metric_type not in types_w_clustering: print("MSE: " + str(self.mse())) print("RMSE: " + str(self.rmse())) if metric_type in types_w_mean_absolute_error: print("MAE: " + str(self.mae())) print("RMSLE: " + str(self.rmsle())) if metric_type in types_w_r2: print("R^2: " + str(self.r2())) if metric_type in types_w_mean_residual_deviance: print("Mean Residual Deviance: " + str(self.mean_residual_deviance())) if metric_type in types_w_logloss: print("LogLoss: " + str(self.logloss())) if metric_type in types_w_mean_per_class_error: print("Mean Per-Class Error: %s" % self._mean_per_class_error()) if metric_type in types_w_glm: if metric_type == 'ModelMetricsHGLMGaussianGaussian': # print something for HGLM print("Standard error of fixed columns: "+str(self.hglm_metric("sefe"))) print("Standard error of random columns: "+str(self.hglm_metric("sere"))) print("Coefficients for fixed columns: "+str(self.hglm_metric("fixedf"))) print("Coefficients for random columns: "+str(self.hglm_metric("ranef"))) print("Random column indices: "+str(self.hglm_metric("randc"))) print("Dispersion parameter of the mean model (residual variance for LMM): "+str(self.hglm_metric("varfix"))) print("Dispersion parameter of the random columns (variance of random columns): "+str(self.hglm_metric("varranef"))) print("Convergence reached for algorithm: "+str(self.hglm_metric("converge"))) print("Deviance degrees of freedom for mean part of the model: "+str(self.hglm_metric("dfrefe"))) print("Estimates and standard errors of the linear prediction in the dispersion model: "+str(self.hglm_metric("summvc1"))) print("Estimates and standard errors of the linear predictor for the dispersion parameter of the random columns: "+str(self.hglm_metric("summvc2"))) print("Index of most influential observation (-1 if none): "+str(self.hglm_metric("bad"))) print("H-likelihood: "+str(self.hglm_metric("hlik"))) print("Profile log-likelihood profiled over random columns: "+str(self.hglm_metric("pvh"))) print("Adjusted profile log-likelihood profiled over fixed and random effects: "+str(self.hglm_metric("pbvh"))) print("Conditional AIC: "+str(self.hglm_metric("caic"))) else: print("Null degrees of freedom: " + str(self.null_degrees_of_freedom())) print("Residual degrees of freedom: " + str(self.residual_degrees_of_freedom())) print("Null deviance: " + str(self.null_deviance())) print("Residual deviance: " + str(self.residual_deviance())) print("AIC: " + str(self.aic())) if metric_type in types_w_bin: print("AUC: " + str(self.auc())) print("AUCPR: " + str(self.aucpr())) print("Gini: " + str(self.gini())) if self.confusion_matrix(): self.confusion_matrix().show() if self._metric_json["max_criteria_and_metric_scores"]: self._metric_json["max_criteria_and_metric_scores"].show() if self.gains_lift(): print(self.gains_lift()) if metric_type in types_w_mult: print("AUC: " + str(self.auc())) print("AUCPR: " + str(self.aucpr())) # AUC and PR AUC table cannot be computed due domain size if self._metric_json["multinomial_auc_table"] is not None: self._metric_json["multinomial_auc_table"].show() else: print("Multinomial auc values: Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains).") if self._metric_json["multinomial_aucpr_table"] is not None: self._metric_json["multinomial_aucpr_table"].show() else: print("Multinomial auc_pr values: Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains).") if metric_type in types_w_anomaly: print("Anomaly Score: " + str(self.mean_score())) print("Normalized Anomaly Score: " + str(self.mean_normalized_score())) if (metric_type in types_w_mult) or (metric_type in types_w_ord): self.confusion_matrix().show() self.hit_ratio_table().show() if metric_type in types_w_clustering: print("Total Within Cluster Sum of Square Error: " + str(self.tot_withinss())) print("Total Sum of Square Error to Grand Mean: " + str(self.totss())) print("Between Cluster Sum of Square Error: " + str(self.betweenss())) if self._metric_json['centroid_stats'] is not None: self._metric_json['centroid_stats'].show() else: print("Centroid stats are not available.") if metric_type in types_w_dim: print("Sum of Squared Error (Numeric): " + str(self.num_err())) print("Misclassification Error (Categorical): " + str(self.cat_err())) if metric_type in types_w_cox: print("Concordance score: " + str(self.concordance())) print("Concordant count: " + str(self.concordant())) print("Tied cout: " + str(self.tied_y())) if metric_type in types_w_uplift: print("AUUC: " + str(self.auuc())) print("AUUC normalized: "+ str(self.auuc_normalized())) if self._metric_json["auuc_table"] is not None: self._metric_json["auuc_table"].show() print() print("Qini value: " + str(self.qini())) if self._metric_json["aecu_table"] is not None: self._metric_json["aecu_table"].show() if self.custom_metric_name(): print("{}: {}".format(self.custom_metric_name(), self.custom_metric_value()))
[docs] def r2(self): """The R squared coefficient. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.r2() """ return self._metric_json["r2"]
[docs] def logloss(self): """Log loss. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.logloss() """ return self._metric_json["logloss"]
[docs] def nobs(self): """ The number of observations. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> perf = cars_gbm.model_performance() >>> perf.nobs() """ return self._metric_json["nobs"]
[docs] def mean_residual_deviance(self): """The mean residual deviance for this set of metrics. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip") >>> air_gbm = H2OGradientBoostingEstimator() >>> air_gbm.train(x=list(range(9)), ... y=9, ... training_frame=airlines, ... validation_frame=airlines) >>> air_gbm.mean_residual_deviance(train=True,valid=False,xval=False) """ return self._metric_json["mean_residual_deviance"]
[docs] def auc(self): """The AUC for this set of metrics. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.auc() """ return self._metric_json['AUC']
[docs] def aucpr(self): """The area under the precision recall curve. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.aucpr() """ return self._metric_json['pr_auc']
[docs] @deprecated_fn(replaced_by=aucpr) def pr_auc(self): pass
[docs] def aic(self): """The AIC for this set of metrics. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate[2] = prostate[2].asfactor() >>> prostate[4] = prostate[4].asfactor() >>> prostate[5] = prostate[5].asfactor() >>> prostate[8] = prostate[8].asfactor() >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"] >>> response = "CAPSULE" >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234) >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial") >>> pros_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> pros_glm.aic() """ return self._metric_json['AIC']
[docs] def gini(self): """Gini coefficient. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.gini() """ return self._metric_json['Gini']
[docs] def mse(self): """The MSE for this set of metrics. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.mse() """ return self._metric_json['MSE']
[docs] def rmse(self): """The RMSE for this set of metrics. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.rmse() """ return self._metric_json['RMSE']
[docs] def mae(self): """The MAE for this set of metrics. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "cylinders" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(distribution = "poisson", ... seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.mae() """ return self._metric_json['mae']
[docs] def rmsle(self): """The RMSLE for this set of metrics. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "cylinders" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(distribution = "poisson", ... seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.rmsle() """ return self._metric_json['rmsle']
[docs] def residual_deviance(self): """The residual deviance if the model has it, otherwise None. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate[2] = prostate[2].asfactor() >>> prostate[4] = prostate[4].asfactor() >>> prostate[5] = prostate[5].asfactor() >>> prostate[8] = prostate[8].asfactor() >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"] >>> response = "CAPSULE" >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234) >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial") >>> pros_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> pros_glm.residual_deviance() """ if MetricsBase._has(self._metric_json, "residual_deviance"): return self._metric_json["residual_deviance"] return None
[docs] def hglm_metric(self, metric_string): if MetricsBase._has(self._metric_json, metric_string): return self._metric_json[metric_string] return None
[docs] def residual_degrees_of_freedom(self): """The residual DoF if the model has residual deviance, otherwise None. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate[2] = prostate[2].asfactor() >>> prostate[4] = prostate[4].asfactor() >>> prostate[5] = prostate[5].asfactor() >>> prostate[8] = prostate[8].asfactor() >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"] >>> response = "CAPSULE" >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234) >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial") >>> pros_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> pros_glm.residual_degrees_of_freedom() """ if MetricsBase._has(self._metric_json, "residual_degrees_of_freedom"): return self._metric_json["residual_degrees_of_freedom"] return None
[docs] def null_deviance(self): """The null deviance if the model has residual deviance, otherwise None. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate[2] = prostate[2].asfactor() >>> prostate[4] = prostate[4].asfactor() >>> prostate[5] = prostate[5].asfactor() >>> prostate[8] = prostate[8].asfactor() >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"] >>> response = "CAPSULE" >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234) >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial") >>> pros_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> pros_glm.null_deviance() """ if MetricsBase._has(self._metric_json, "null_deviance"): return self._metric_json["null_deviance"] return None
[docs] def null_degrees_of_freedom(self): """The null DoF if the model has residual deviance, otherwise None. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate[2] = prostate[2].asfactor() >>> prostate[4] = prostate[4].asfactor() >>> prostate[5] = prostate[5].asfactor() >>> prostate[8] = prostate[8].asfactor() >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"] >>> response = "CAPSULE" >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234) >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial") >>> pros_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> pros_glm.null_degrees_of_freedom() """ if MetricsBase._has(self._metric_json, "null_degrees_of_freedom"): return self._metric_json["null_degrees_of_freedom"] return None
# private accessor for mean per-class error - the public version is overridden in H2OBinomialModelMetrics with # a method with different return semantics def _mean_per_class_error(self): return self._metric_json['mean_per_class_error']
[docs] def mean_per_class_error(self): """The mean per class error. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate[2] = prostate[2].asfactor() >>> prostate[4] = prostate[4].asfactor() >>> prostate[5] = prostate[5].asfactor() >>> prostate[8] = prostate[8].asfactor() >>> predictors = ["AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"] >>> response = "CAPSULE" >>> train, valid = prostate.split_frame(ratios=[.8],seed=1234) >>> pros_glm = H2OGeneralizedLinearEstimator(family="binomial") >>> pros_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> pros_glm.mean_per_class_error() """ return self._mean_per_class_error()
[docs] def custom_metric_name(self): """Name of custom metric or None.""" if MetricsBase._has(self._metric_json, "custom_metric_name"): return self._metric_json['custom_metric_name'] else: return None
[docs] def custom_metric_value(self): """Value of custom metric or None.""" if MetricsBase._has(self._metric_json, "custom_metric_value"): return self._metric_json['custom_metric_value'] else: return None
[docs]class H2ORegressionModelMetrics(MetricsBase): """ This class provides an API for inspecting the metrics returned by a regression model. It is possible to retrieve the :math:`R^2` (1 - MSE/variance) and MSE. :examples: >>> from h2o.estimators.glm import H2OGeneralizedLinearEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "cylinders" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_glm = H2OGeneralizedLinearEstimator() >>> cars_glm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_glm.mse() """ def __init__(self, metric_json, on=None, algo=""): super(H2ORegressionModelMetrics, self).__init__(metric_json, on, algo)
[docs]class H2OClusteringModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2OClusteringModelMetrics, self).__init__(metric_json, on, algo)
[docs] def tot_withinss(self): """The Total Within Cluster Sum-of-Square Error, or None if not present. :examples: >>> from h2o.estimators.kmeans import H2OKMeansEstimator >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv") >>> km = H2OKMeansEstimator(k=3, nfolds=3) >>> km.train(x=list(range(4)), training_frame=iris) >>> km.tot_withinss() """ if MetricsBase._has(self._metric_json, "tot_withinss"): return self._metric_json["tot_withinss"] return None
[docs] def totss(self): """The Total Sum-of-Square Error to Grand Mean, or None if not present. :examples: >>> from h2o.estimators.kmeans import H2OKMeansEstimator >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv") >>> km = H2OKMeansEstimator(k=3, nfolds=3) >>> km.train(x=list(range(4)), training_frame=iris) >>> km.totss() """ if MetricsBase._has(self._metric_json, "totss"): return self._metric_json["totss"] return None
[docs] def betweenss(self): """The Between Cluster Sum-of-Square Error, or None if not present. :examples: >>> from h2o.estimators.kmeans import H2OKMeansEstimator >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv") >>> km = H2OKMeansEstimator(k=3, nfolds=3) >>> km.train(x=list(range(4)), training_frame=iris) >>> km.betweenss() """ if MetricsBase._has(self._metric_json, "betweenss"): return self._metric_json["betweenss"] return None
[docs]class H2OMultinomialModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2OMultinomialModelMetrics, self).__init__(metric_json, on, algo)
[docs] def confusion_matrix(self): """Returns a confusion matrix based on H2O's default prediction threshold for a dataset. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution = distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.confusion_matrix(train) """ return self._metric_json['cm']['table']
[docs] def hit_ratio_table(self): """Retrieve the Hit Ratios. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution = distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.hit_ratio_table() """ return self._metric_json['hit_ratio_table']
[docs] def multinomial_auc_table(self): """Retrieve the multinomial AUC values. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution = distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.multinomial_auc_table() """ if self._metric_json['multinomial_auc_table'] is not None: return self._metric_json['multinomial_auc_table'] else: return "Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains)."
[docs] def multinomial_aucpr_table(self): """Retrieve the multinomial PR AUC values. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution = distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.multinomial_aucpr_table() """ if self._metric_json['multinomial_aucpr_table'] is not None: return self._metric_json['multinomial_aucpr_table'] else: return "Table is not computed because it is disabled (model parameter 'auc_type' is set to AUTO or NONE) or due to domain size (maximum is 50 domains)."
[docs]class H2OOrdinalModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2OOrdinalModelMetrics, self).__init__(metric_json, on, algo)
[docs] def confusion_matrix(self): """Returns a confusion matrix based of H2O's default prediction threshold for a dataset. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution = distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.confusion_matrix(train) """ return self._metric_json['cm']['table']
[docs] def hit_ratio_table(self): """Retrieve the Hit Ratios. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution = distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.hit_ratio_table() """ return self._metric_json['hit_ratio_table']
[docs]class H2OHGLMModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo="HGLM Gaussian Gaussian"): super(H2OHGLMModelMetrics, self).__init__(metric_json, on, algo)
[docs]class H2OBinomialModelMetrics(MetricsBase): """ This class is essentially an API for the AUC object. This class contains methods for inspecting the AUC for different criteria. To input the different criteria, use the static variable ``criteria``. """ def __init__(self, metric_json, on=None, algo=""): """ Create a new Binomial Metrics object (essentially a wrapper around some json) :param metric_json: A blob of json holding all of the needed information :param on_train: Metrics built on training data (default is False) :param on_valid: Metrics built on validation data (default is False) :param on_xval: Metrics built on cross validation data (default is False) :param algo: The algorithm the metrics are based off of (e.g. deeplearning, gbm, etc.) :returns: A new H2OBinomialModelMetrics object. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.auc(valid=True) """ super(H2OBinomialModelMetrics, self).__init__(metric_json, on, algo)
[docs] def F1(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The F1 for the given set of thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.F1() """ return self.metric("f1", thresholds=thresholds)
[docs] def F2(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The F2 for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.F2() """ return self.metric("f2", thresholds=thresholds)
[docs] def F0point5(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The F0.5 for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.F0point5() """ return self.metric("f0point5", thresholds=thresholds)
[docs] def accuracy(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The accuracy for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.accuracy() """ return self.metric("accuracy", thresholds=thresholds)
[docs] def error(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold minimizing the error will be used. :returns: The error for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.error() """ return H2OBinomialModelMetrics._accuracy_to_error(self.metric("accuracy", thresholds=thresholds))
[docs] def precision(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The precision for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.precision() """ return self.metric("precision", thresholds=thresholds)
[docs] def tpr(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The True Postive Rate. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.tpr() """ return self.metric("tpr", thresholds=thresholds)
[docs] def tnr(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The True Negative Rate. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.tnr() """ return self.metric("tnr", thresholds=thresholds)
[docs] def fnr(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The False Negative Rate. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.fnr() """ return self.metric("fnr", thresholds=thresholds)
[docs] def fpr(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The False Positive Rate. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.fpr() """ return self.metric("fpr", thresholds=thresholds)
[docs] def recall(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: Recall for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.recall() """ return self.metric("recall", thresholds=thresholds)
[docs] def sensitivity(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: Sensitivity or True Positive Rate for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.sensitivity() """ return self.metric("sensitivity", thresholds=thresholds)
[docs] def fallout(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The fallout (same as False Positive Rate) for this set of metrics and thresholds. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.fallout() """ return self.metric("fallout", thresholds=thresholds)
[docs] def missrate(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The miss rate (same as False Negative Rate). :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.missrate() """ return self.metric("missrate", thresholds=thresholds)
[docs] def specificity(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The specificity (same as True Negative Rate). :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.specificity() """ return self.metric("specificity", thresholds=thresholds)
[docs] def mcc(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. :returns: The absolute MCC (a value between 0 and 1, 0 being totally dissimilar, 1 being identical). :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.mcc() """ return self.metric("absolute_mcc", thresholds=thresholds)
[docs] def max_per_class_error(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold minimizing the error will be used. :returns: Return 1 - min(per class accuracy). :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.max_per_class_error() """ return H2OBinomialModelMetrics._accuracy_to_error(self.metric("min_per_class_accuracy", thresholds=thresholds))
[docs] def mean_per_class_error(self, thresholds=None): """ :param thresholds: thresholds parameter must be a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold minimizing the error will be used. :returns: mean per class error. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.mean_per_class_error() """ return H2OBinomialModelMetrics._accuracy_to_error(self.metric("mean_per_class_accuracy", thresholds=thresholds))
@staticmethod def _accuracy_to_error(accuracies): errors = List() errors.extend([acc[0], 1 - acc[1]] for acc in accuracies) setattr(errors, 'value', [1 - v for v in accuracies.value] if isinstance(accuracies.value, list) else 1 - accuracies.value ) return errors
[docs] def metric(self, metric, thresholds=None): """ :param str metric: A metric among :const:`maximizing_metrics`. :param thresholds: thresholds parameter must be a number or a list (e.g. ``[0.01, 0.5, 0.99]``). If None, then the threshold maximizing the metric will be used. If 'all', then all stored thresholds are used and returned with the matching metric. :returns: The set of metrics for the list of thresholds. The returned list has a 'value' property holding only the metric value (if no threshold provided or if provided as a number), or all the metric values (if thresholds provided as a list) :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> local_data = [[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'], ... [1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'], ... [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'], ... [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b']] >>> h2o_data = h2o.H2OFrame(local_data) >>> h2o_data.set_names(['response', 'predictor']) >>> h2o_data["response"] = h2o_data["response"].asfactor() >>> gbm = H2OGradientBoostingEstimator(ntrees=1, ... distribution="bernoulli") >>> gbm.train(x=list(range(1,h2o_data.ncol)), ... y="response", ... training_frame=h2o_data) >>> perf = gbm.model_performance() >>> perf.metric("tps", [perf.find_threshold_by_max_metric("f1")])[0][1] """ assert_is_type(thresholds, None, 'all', numeric, [numeric]) if metric not in H2OBinomialModelMetrics.maximizing_metrics: raise ValueError("The only allowable metrics are {}".format(', '.join(H2OBinomialModelMetrics.maximizing_metrics))) h2o_metric = (H2OBinomialModelMetrics.metrics_aliases[metric] if metric in H2OBinomialModelMetrics.metrics_aliases else metric) value_is_scalar = is_type(metric, str) and (thresholds is None or is_type(thresholds, numeric)) if thresholds is None: thresholds = [self.find_threshold_by_max_metric(h2o_metric)] elif thresholds == 'all': thresholds = None elif is_type(thresholds, numeric): thresholds = [thresholds] metrics = List() thresh2d = self._metric_json['thresholds_and_metric_scores'] if thresholds is None: # fast path to return all thresholds: skipping find_idx logic metrics.extend(list(t) for t in zip(thresh2d['threshold'], thresh2d[h2o_metric])) else: for t in thresholds: idx = self.find_idx_by_threshold(t) metrics.append([t, thresh2d[h2o_metric][idx]]) setattr(metrics, 'value', metrics[0][1] if value_is_scalar else list(r[1] for r in metrics) ) return metrics
[docs] @deprecated_params({'save_to_file': 'save_plot_path'}) def plot(self, type="roc", server=False, save_plot_path=None, plot=True): """ Produce the desired metric plot. :param type: the type of metric plot. One of (currently supported): - ROC curve ('roc') - Precision Recall curve ('pr') - Gains Lift curve ('gainslift') :param server: if True, generate plot inline using matplotlib's Anti-Grain Geometry (AGG) backend. :param save_plot_path: filename to save the plot to. :param plot: ``True`` to plot curve; ``False`` to get a tuple of values at axis x and y of the plot (tprs and fprs for AUC, recall and precision for PR). :returns: None or values of x and y axis of the plot + the resulting plot (can be accessed using ``result.figure()``). :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios = [.8], seed = 1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed = 1234) >>> cars_gbm.train(x = predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> cars_gbm.plot(type="roc") >>> cars_gbm.plot(type="pr") """ assert type in ["roc", "pr", "gains_lift"] if type == "roc": return self._plot_roc(server, save_plot_path, plot) elif type == "pr": return self._plot_pr(server, save_plot_path, plot) elif type == "gains_lift": return self.gains_lift_plot(server=server, save_plot_path=save_plot_path, plot=plot)
def _plot_roc(self, server=False, save_to_file=None, plot=True): if plot: plt = get_matplotlib_pyplot(server) if plt is None: return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS) fig = plt.figure() plt.xlabel('False Positive Rate (FPR)') plt.ylabel('True Positive Rate (TPR)') plt.title('Receiver Operating Characteristic Curve') plt.text(0.5, 0.5, r'AUC={0:.4f}'.format(self._metric_json["AUC"])) plt.plot(self.fprs, self.tprs, 'b--') plt.axis([0, 1, 0, 1]) plt.grid(True) plt.tight_layout() if not server: plt.show() if save_to_file is not None: # only save when a figure is actually plotted fig.savefig(fname=save_to_file) return decorate_plot_result(res=(self.fprs, self.tprs), figure=fig) else: return decorate_plot_result(res=(self.fprs, self.tprs)) def _plot_pr(self, server=False, save_to_file=None, plot=True): recalls = [x[0] for x in self.recall(thresholds='all')] precisions = self.tprs assert len(precisions) == len(recalls), "Precision and recall arrays must have the same length" if plot: plt = get_matplotlib_pyplot(server) if plt is None: return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS) fig = plt.figure() plt.xlabel('Recall (TP/(TP+FP))') plt.ylabel('Precision (TPR)') plt.title('Precision Recall Curve') plt.text(0.75, 0.95, r'auc_pr={0:.4f}'.format(self._metric_json["pr_auc"])) plt.plot(recalls, precisions, 'b--') plt.axis([0, 1, 0, 1]) plt.grid(True) plt.tight_layout() if not server: plt.show() if save_to_file is not None: # only save when a figure is actually plotted plt.savefig(fname=save_to_file) return decorate_plot_result(res=(recalls, precisions), figure=fig) else: return decorate_plot_result(res=(recalls, precisions)) @property def fprs(self): """ Return all false positive rates for all threshold values. :returns: a list of false positive rates. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> r = cars[0].runif() >>> train = cars[r > .2] >>> valid = cars[r <= .2] >>> response_col = "economy_20mpg" >>> distribution = "bernoulli" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random") >>> gbm.train(y=response_col, x=predictors, validation_frame=valid, training_frame=train) >>> (fprs, tprs) = gbm.roc(train=True, valid=False, xval=False) >>> fprs """ return self._metric_json["thresholds_and_metric_scores"]["fpr"] @property def tprs(self): """ Return all true positive rates for all threshold values. :returns: a list of true positive rates. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> r = cars[0].runif() >>> train = cars[r > .2] >>> valid = cars[r <= .2] >>> response_col = "economy_20mpg" >>> distribution = "bernoulli" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random") >>> gbm.train(y=response_col, x=predictors, validation_frame=valid, training_frame=train) >>> (fprs, tprs) = gbm.roc(train=True, valid=False, xval=False) >>> tprs """ return self._metric_json["thresholds_and_metric_scores"]["tpr"]
[docs] def roc(self): """ Return the coordinates of the ROC curve as a tuple containing the false positive rates as a list and true positive rates as a list. :returns: The ROC values. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> r = cars[0].runif() >>> train = cars[r > .2] >>> valid = cars[r <= .2] >>> response_col = "economy_20mpg" >>> distribution = "bernoulli" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution=distribution, ... fold_assignment="Random") >>> gbm.train(x=predictors, ... y=response_col, ... validation_frame=valid, ... training_frame=train) >>> gbm.roc(train=True, valid=False, xval=False) """ return self.fprs, self.tprs
metrics_aliases = dict( fallout='fpr', missrate='fnr', recall='tpr', sensitivity='tpr', specificity='tnr' ) #: metrics names allowed for confusion matrix maximizing_metrics = ('absolute_mcc', 'accuracy', 'precision', 'f0point5', 'f1', 'f2', 'mean_per_class_accuracy', 'min_per_class_accuracy', 'tns', 'fns', 'fps', 'tps', 'tnr', 'fnr', 'fpr', 'tpr') + tuple(metrics_aliases.keys())
[docs] def confusion_matrix(self, metrics=None, thresholds=None): """ Get the confusion matrix for the specified metric. :param metrics: A string (or list of strings) among metrics listed in :const:`maximizing_metrics`. Defaults to ``'f1'``. :param thresholds: A value (or list of values) between 0 and 1. If None, then the thresholds maximizing each provided metric will be used. :returns: a list of ConfusionMatrix objects (if there are more than one to return), a single ConfusionMatrix (if there is only one), or None if thresholds are metrics scores are missing. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution=distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.confusion_matrix(train) """ thresh2d = self._metric_json['thresholds_and_metric_scores'] if thresh2d is None: return None # make lists out of metrics and thresholds arguments if metrics is None and thresholds is None: metrics = ['f1'] if isinstance(metrics, list): metrics_list = metrics elif metrics is None: metrics_list = [] else: metrics_list = [metrics] if isinstance(thresholds, list): thresholds_list = thresholds elif thresholds is None: thresholds_list = [] else: thresholds_list = [thresholds] # error check the metrics_list and thresholds_list assert_is_type(thresholds_list, [numeric]) assert_satisfies(thresholds_list, all(0 <= t <= 1 for t in thresholds_list)) if not all(m.lower() in H2OBinomialModelMetrics.maximizing_metrics for m in metrics_list): raise ValueError("The only allowable metrics are {}".format(', '.join(H2OBinomialModelMetrics.maximizing_metrics))) # make one big list that combines the thresholds and metric-thresholds metrics_thresholds = [self.find_threshold_by_max_metric(m) for m in metrics_list] for mt in metrics_thresholds: thresholds_list.append(mt) first_metrics_thresholds_offset = len(thresholds_list) - len(metrics_thresholds) actual_thresholds = [float(e[0]) for i, e in enumerate(thresh2d.cell_values)] cms = [] for i, t in enumerate(thresholds_list): idx = self.find_idx_by_threshold(t) row = thresh2d.cell_values[idx] tns = row[11] fns = row[12] fps = row[13] tps = row[14] p = tps + fns n = tns + fps c0 = n - fps c1 = p - tps if t in metrics_thresholds: m = metrics_list[i - first_metrics_thresholds_offset] table_header = "Confusion Matrix (Act/Pred) for max {} @ threshold = {}".format(m, actual_thresholds[idx]) else: table_header = "Confusion Matrix (Act/Pred) @ threshold = {}".format(actual_thresholds[idx]) cms.append(ConfusionMatrix(cm=[[c0, fps], [c1, tps]], domains=self._metric_json['domain'], table_header=table_header)) if len(cms) == 1: return cms[0] else: return cms
[docs] def find_threshold_by_max_metric(self, metric): """ :param metrics: A string among the metrics listed in :const:`maximizing_metrics`. :returns: the threshold at which the given metric is maximal. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> local_data = [[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'], ... [1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'], ... [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'], ... [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b']] >>> h2o_data = h2o.H2OFrame(local_data) >>> h2o_data.set_names(['response', 'predictor']) >>> h2o_data["response"] = h2o_data["response"].asfactor() >>> gbm = H2OGradientBoostingEstimator(ntrees=1, ... distribution="bernoulli") >>> gbm.train(x=list(range(1,h2o_data.ncol)), ... y="response", ... training_frame=h2o_data) >>> perf = gbm.model_performance() >>> perf.find_threshold_by_max_metric("f1") """ crit2d = self._metric_json['max_criteria_and_metric_scores'] # print(crit2d) h2o_metric = (H2OBinomialModelMetrics.metrics_aliases[metric] if metric in H2OBinomialModelMetrics.metrics_aliases else metric) for e in crit2d.cell_values: if e[0] == "max " + h2o_metric.lower(): return e[1] raise ValueError("No metric " + str(metric.lower()))
[docs] def find_idx_by_threshold(self, threshold): """ Retrieve the index in this metric's threshold list at which the given threshold is located. :param threshold: Find the index of this input threshold. :returns: the index. :raises ValueError: if no such index can be found. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> local_data = [[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'], ... [1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'],[1, 'a'], ... [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'], ... [0, 'b'],[0, 'b'],[0, 'b'],[0, 'b'],[0, 'b']] >>> h2o_data = h2o.H2OFrame(local_data) >>> h2o_data.set_names(['response', 'predictor']) >>> h2o_data["response"] = h2o_data["response"].asfactor() >>> gbm = H2OGradientBoostingEstimator(ntrees=1, ... distribution="bernoulli") >>> gbm.train(x=list(range(1,h2o_data.ncol)), ... y="response", ... training_frame=h2o_data) >>> perf = gbm.model_performance() >>> perf.find_idx_by_threshold(0.45) """ assert_is_type(threshold, numeric) thresh2d = self._metric_json['thresholds_and_metric_scores'] # print(thresh2d) for i, e in enumerate(thresh2d.cell_values): t = float(e[0]) if abs(t - threshold) < 1e-8 * max(t, threshold): return i if 0 <= threshold <= 1: thresholds = [float(e[0]) for i, e in enumerate(thresh2d.cell_values)] threshold_diffs = [abs(t - threshold) for t in thresholds] closest_idx = threshold_diffs.index(min(threshold_diffs)) closest_threshold = thresholds[closest_idx] print("Could not find exact threshold {0}; using closest threshold found {1}." .format(threshold, closest_threshold)) return closest_idx raise ValueError("Threshold must be between 0 and 1, but got {0} ".format(threshold))
[docs] def gains_lift(self): """Retrieve the Gains/Lift table. :examples: >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["cylinders"] = cars["cylinders"].asfactor() >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> response_col = "cylinders" >>> distribution = "multinomial" >>> predictors = ["displacement","power","weight","acceleration","year"] >>> gbm = H2OGradientBoostingEstimator(nfolds=3, ... distribution=distribution) >>> gbm.train(x=predictors, ... y = response, ... training_frame = train, ... validation_frame = valid) >>> gbm.gains_lift() """ if 'gains_lift_table' in self._metric_json: return self._metric_json['gains_lift_table'] return None
[docs] @deprecated_params({'save_to_file': 'save_plot_path'}) def gains_lift_plot(self, type="both", server=False, save_plot_path=None, plot=True): """ Plot Gains/Lift curves. :param type: one of: - "both" (default) - "gains" - "lift" :param server: if ``True``, generate plot inline using matplotlib's Anti-Grain Geometry (AGG) backend. :param save_plot_path: filename to save the plot to. :param plot: ``True`` to plot curve; ``False`` to get a gains lift table. :returns: Gains lift table + the resulting plot (can be accessed using ``result.figure()``). """ type = type.lower() assert type in ["both", "gains", "lift"] gl = self.gains_lift() if plot: plt = get_matplotlib_pyplot(server) if plt is None: return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS) title = [] ylab = [] x = gl['cumulative_data_fraction'] yccr = gl['cumulative_capture_rate'] ycl = gl['cumulative_lift'] plt = get_matplotlib_pyplot(server=False, raise_if_not_available=True) fig = plt.figure(figsize=(10, 10)) plt.grid(True) if type in ["both", "gains"]: plt.plot(x, yccr, zorder=10, label='cumulative capture rate') title.append("Gains") ylab.append('cumulative capture rate') if type in ["both", "lift"]: plt.plot(x, ycl, zorder=10, label='cumulative lift') title.append("Lift") ylab.append('cumulative lift') plt.legend(loc=4, fancybox=True, framealpha=0.5) plt.xlim(0, None) plt.ylim(0, None) plt.xlabel('cumulative data fraction') plt.ylabel(", ".join(ylab)) plt.title(" / ".join(title)) if not server: plt.show() if save_plot_path is not None: # only save when a figure is actually plotted fig.savefig(fname=save_plot_path) return decorate_plot_result(res=gl, figure=fig) else: return decorate_plot_result(res=gl)
[docs]class H2OBinomialUpliftModelMetrics(MetricsBase): """ This class is available only for Uplift DRF model. This class is essentially an API for the AUUC object. """ def __init__(self, metric_json, on=None, algo=""): """ Create a new Binomial Uplift Metrics object (essentially a wrapper around some json) :param metric_json: A blob of json holding all of the needed information :param on: Metrics built on "training_data" or "validation_data" (default is "training_data") """ super(H2OBinomialUpliftModelMetrics, self).__init__(metric_json, on, algo)
[docs] def auuc(self, metric=None): """ Retrieve area under cumulative uplift curve (AUUC) value. :param metric: AUUC metric type. One of: - "None" (default; takes default metric from model parameters) - "AUTO" (defaults to "qini") - "qini" - "lift" - "gain" :returns: AUUC value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.auuc() """ if metric is None: return self._metric_json['AUUC'] else: assert metric in ['AUTO', 'qini', 'lift', 'gain'],\ "AUUC metric "+metric+" should be 'AUTO', 'qini','lift' or 'gain'." if metric == "AUTO": metric = 'qini' return self._metric_json['auuc_table'][metric][0]
[docs] def auuc_normalized(self, metric=None): """ Retrieve normalized area under cumulative uplift curve (AUUC) value. :param metric: AUUC metric type. One of: - "None" (default; takes default metric from model parameters) - "AUTO" (defaults to "qini") - "qini" - "lift" - "gain" :returns: normalized AUUC value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.auuc_normalized() """ if metric is None: return self._metric_json['auuc_normalized'] else: assert metric in ['AUTO', 'qini', 'lift', 'gain'], \ "AUUC metric "+metric+" should be 'AUTO', 'qini','lift' or 'gain'." if metric == "AUTO": metric = 'qini' return self._metric_json['auuc_table'][metric][1]
[docs] def qini(self): """ Retrieve Qini value (area between Qini cumulative uplift curve and random curve). :returns: Qini value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.qini() """ return self._metric_json['qini']
[docs] def aecu(self, metric="AUTO"): """ Retrieve AECU value (average excess cumulative uplift - area between Uplift curve and random curve). :param metric: AECU metric type One of: - "None" - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: AECU value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.aecu() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'], \ "AECU metric "+metric+" should be 'qini','lift' or 'gain'." if metric == 'AUTO': metric = 'qini' return self._metric_json['aecu_table'][metric][0]
[docs] def uplift(self, metric="AUTO"): """ Retrieve uplift values for each bin. :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: a list of uplift values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.uplift() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'] if metric == "AUTO": metric = 'qini' return self._metric_json["thresholds_and_metric_scores"][metric]
[docs] def uplift_normalized(self, metric="AUTO"): """ Retrieve normalized uplift values for each bin. :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: a list of normalized uplift values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.uplift_normalized() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'] if metric == "AUTO": metric = 'qini' return self._metric_json["thresholds_and_metric_scores"][metric+"_normalized"]
[docs] def uplift_random(self, metric="AUTO"): """ Retrieve random uplift values for each bin. :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: a list of random uplift values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.uplift_random() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'] if metric == "AUTO": metric = 'qini' return self._metric_json["thresholds_and_metric_scores"][metric+"_random"]
[docs] def n(self): """ Retrieve cumulative sum of numbers of observations in each bin. :returns: a list of numbers of observation. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.n() """ return self._metric_json["thresholds_and_metric_scores"]["n"]
[docs] def thresholds(self): """ Retrieve prediction thresholds for each bin. :returns: a list of thresholds. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.thresholds() """ return self._metric_json["thresholds_and_metric_scores"]["thresholds"]
[docs] def thresholds_and_metric_scores(self): """ Retrieve thresholds and metric scores table. :returns: a thresholds and metric scores table for the specified key(s). :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.thresholds_and_metric_scores() """ return self._metric_json["thresholds_and_metric_scores"]
[docs] def auuc_table(self): """ Retrieve all types of AUUC in a table. :returns: a table of AUUCs. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.auuc_table() """ return self._metric_json["auuc_table"]
[docs] def aecu_table(self): """ Retrieve all types of AECU values in a table. :returns: a table of AECU values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.aecu_table() """ return self._metric_json["aecu_table"]
[docs] def plot_uplift(self, server=False, save_to_file=None, plot=True, metric="AUTO", normalize=False): """ Plot Uplift Curve. :param server: if ``True``, generate plot inline using matplotlib's Anti-Grain Geometry (AGG) backend. :param save_to_file: filename to save the plot to. :param plot: ``True`` to plot curve, ``False`` to get a tuple of values at axis x and y of the plot (number of observations and uplift values) :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :param normalize: If ``True``, normalized values are plotted. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.plot_uplift(plot=True) >>> n, uplift = perf.plot_uplift(plot=False) """ assert metric in ['AUTO', 'qini', 'lift', 'gain'], \ "Metric "+metric+" should be 'AUTO', 'qini','lift' or 'gain'." if plot: plt = get_matplotlib_pyplot(server) if plt is None: return plt.ylabel('Cumulative '+metric) plt.xlabel('Number Targeted') rnd = self.uplift_random(metric) if normalize: plt.title('Cumulate Uplift Curve - '+metric+"\n"+r'Normalized AUUC={0:.4f}'.format(self.auuc_normalized(metric))) uplift = self.uplift_normalized(metric) if metric != "lift": max = abs(rnd[len(rnd)-1]) rnd = [x / max for x in rnd] else: plt.title('Cumulate Uplift Curve - '+metric+"\n"+r'AUUC={0:.4f}'.format(self.auuc(metric))) uplift = self.uplift(metric) n = self.n() plt.plot(n, uplift, 'b-', label='uplift') plt.plot(n, rnd, 'k--', label='random') if metric == "lift": plt.legend(loc='upper right') else: plt.legend(loc='lower right') plt.grid(True) plt.tight_layout() if not server: plt.show() if save_to_file is not None: # only save when a figure is actually plotted plt.savefig(save_to_file) else: return self.n(), self.uplift(metric)
[docs]class H2OAutoEncoderModelMetrics(MetricsBase): """ :examples: >>> from h2o.estimators.deeplearning import H2OAutoEncoderEstimator >>> train_ecg = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_train.csv") >>> test_ecg = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv") >>> anomaly_model = H2OAutoEncoderEstimator(activation="Tanh", ... hidden=[50,50,50], ... sparse=True, l1=1e-4, ... epochs=100) >>> anomaly_model.train(x=train_ecg.names, training_frame=train_ecg) >>> anomaly_model.mse() """ def __init__(self, metric_json, on=None, algo=""): super(H2OAutoEncoderModelMetrics, self).__init__(metric_json, on, algo)
[docs]class H2ODimReductionModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2ODimReductionModelMetrics, self).__init__(metric_json, on, algo)
[docs] def num_err(self): """Sum of Squared Error over non-missing numeric entries, or None if not present.""" if MetricsBase._has(self._metric_json, "numerr"): return self._metric_json["numerr"] return None
[docs] def cat_err(self): """The Number of Misclassified categories over non-missing categorical entries, or None if not present.""" if MetricsBase._has(self._metric_json, "caterr"): return self._metric_json["caterr"] return None
[docs]class H2OWordEmbeddingModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2OWordEmbeddingModelMetrics, self).__init__(metric_json, on, algo)
[docs]class H2OAnomalyDetectionModelMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2OAnomalyDetectionModelMetrics, self).__init__(metric_json, on, algo)
[docs] def mean_score(self): """ Mean Anomaly Score. For Isolation Forest represents the average of all tree-path lengths. :examples: >>> from h2o.estimators.isolation_forest import H2OIsolationForestEstimator >>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_train.csv") >>> test = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_test.csv") >>> isofor_model = H2OIsolationForestEstimator(sample_size=5, ntrees=7) >>> isofor_model.train(training_frame = train) >>> perf = isofor_model.model_performance() >>> perf.mean_score() """ if MetricsBase._has(self._metric_json, "mean_score"): return self._metric_json["mean_score"] return None
[docs] def mean_normalized_score(self): """ Mean Normalized Anomaly Score. For Isolation Forest - normalized average path length. :examples: >>> from h2o.estimators.isolation_forest import H2OIsolationForestEstimator >>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_train.csv") >>> test = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_test.csv") >>> isofor_model = H2OIsolationForestEstimator(sample_size=5, ntrees=7) >>> isofor_model.train(training_frame = train) >>> perf = isofor_model.model_performance() >>> perf.mean_normalized_score() """ if MetricsBase._has(self._metric_json, "mean_normalized_score"): return self._metric_json["mean_normalized_score"] return None
[docs]class H2OModelMetricsRegressionCoxPH(MetricsBase): """ :examples: >>> from h2o.estimators.coxph import H2OCoxProportionalHazardsEstimator >>> heart = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv") >>> coxph = H2OCoxProportionalHazardsEstimator(start_column="start", ... stop_column="stop", ... ties="breslow") >>> coxph.train(x="age", y="event", training_frame=heart) >>> coxph """
[docs] def concordance(self): """Concordance metrics (c-index). Proportion of concordant pairs divided by the total number of possible evaluation pairs. 1.0 for perfect match, 0.5 for random results.""" if MetricsBase._has(self._metric_json, "concordance"): return self._metric_json["concordance"] return None
[docs] def concordant(self): """Count of concordant pairs.""" if MetricsBase._has(self._metric_json, "concordant"): return self._metric_json["concordant"] return None
[docs] def tied_y(self): """Count of tied pairs.""" if MetricsBase._has(self._metric_json, "tied_y"): return self._metric_json["tied_y"] return None
def __init__(self, metric_json, on=None, algo=""): super(H2OModelMetricsRegressionCoxPH, self).__init__(metric_json, on, algo)
[docs]class H2OTargetEncoderMetrics(MetricsBase): def __init__(self, metric_json, on=None, algo=""): super(H2OTargetEncoderMetrics, self).__init__(metric_json, on, algo)
[docs]class List(list): pass