# Source code for h2o.model.metrics.uplift

import h2o
from h2o.model import MetricsBase
from h2o.plot import get_matplotlib_pyplot


class H2OBinomialUpliftModelMetrics(MetricsBase):
    """
    Metrics container for Uplift DRF models.

    This class is available only for the Uplift DRF model and is essentially an API for the AUUC object.
    """

    def _str_items_custom(self):
        """
        Assemble the uplift-specific entries used when this metrics object is rendered as text.

        :returns: a list holding the AUUC line, the normalized AUUC line, the AUUC table (when truthy),
            the Qini line and the AECU table (when truthy), in that order.
        """
        lines = []
        lines.append("AUUC: {}".format(self.auuc()))
        lines.append("AUUC normalized: {}".format(self.auuc_normalized()))

        auuc_tbl = self.auuc_table()
        if auuc_tbl:
            lines.append(auuc_tbl)

        lines.append("Qini value: {}".format(self.qini()))

        aecu_tbl = self.aecu_table()
        if aecu_tbl:
            lines.append(aecu_tbl)

        return lines
[docs] def auuc(self, metric=None): """ Retrieve area under cumulative uplift curve (AUUC) value. :param metric: AUUC metric type. One of: - "None" (default; takes default metric from model parameters) - "AUTO" (defaults to "qini") - "qini" - "lift" - "gain" :returns: AUUC value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.auuc() """ if metric is None: return self._metric_json['AUUC'] else: assert metric in ['AUTO', 'qini', 'lift', 'gain'],\ "AUUC metric "+metric+" should be 'AUTO', 'qini','lift' or 'gain'." if metric == "AUTO": metric = 'qini' return self._metric_json['auuc_table'][metric][0]
[docs] def auuc_normalized(self, metric=None): """ Retrieve normalized area under cumulative uplift curve (AUUC) value. :param metric: AUUC metric type. One of: - "None" (default; takes default metric from model parameters) - "AUTO" (defaults to "qini") - "qini" - "lift" - "gain" :returns: normalized AUUC value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.auuc_normalized() """ if metric is None: return self._metric_json['auuc_normalized'] else: assert metric in ['AUTO', 'qini', 'lift', 'gain'], \ "AUUC metric "+metric+" should be 'AUTO', 'qini','lift' or 'gain'." if metric == "AUTO": metric = 'qini' return self._metric_json['auuc_table'][metric][1]
[docs] def qini(self): """ Retrieve Qini value (area between Qini cumulative uplift curve and random curve). :returns: Qini value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.qini() """ return self._metric_json['qini']
[docs] def aecu(self, metric="AUTO"): """ Retrieve AECU value (average excess cumulative uplift - area between Uplift curve and random curve). :param metric: AECU metric type One of: - "None" - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: AECU value. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.aecu() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'], \ "AECU metric "+metric+" should be 'qini','lift' or 'gain'." if metric == 'AUTO': metric = 'qini' return self._metric_json['aecu_table'][metric][0]
[docs] def uplift(self, metric="AUTO"): """ Retrieve uplift values for each bin. :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: a list of uplift values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.uplift() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'] if metric == "AUTO": metric = 'qini' return self._metric_json["thresholds_and_metric_scores"][metric]
[docs] def uplift_normalized(self, metric="AUTO"): """ Retrieve normalized uplift values for each bin. :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: a list of normalized uplift values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.uplift_normalized() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'] if metric == "AUTO": metric = 'qini' return self._metric_json["thresholds_and_metric_scores"][metric+"_normalized"]
[docs] def uplift_random(self, metric="AUTO"): """ Retrieve random uplift values for each bin. :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :returns: a list of random uplift values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.uplift_random() """ assert metric in ['AUTO', 'qini', 'lift', 'gain'] if metric == "AUTO": metric = 'qini' return self._metric_json["thresholds_and_metric_scores"][metric+"_random"]
[docs] def n(self): """ Retrieve cumulative sum of numbers of observations in each bin. :returns: a list of numbers of observation. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.n() """ return self._metric_json["thresholds_and_metric_scores"]["n"]
[docs] def thresholds(self): """ Retrieve prediction thresholds for each bin. :returns: a list of thresholds. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.thresholds() """ return self._metric_json["thresholds_and_metric_scores"]["thresholds"]
[docs] def thresholds_and_metric_scores(self): """ Retrieve thresholds and metric scores table. :returns: a thresholds and metric scores table for the specified key(s). :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.thresholds_and_metric_scores() """ return self._metric_json["thresholds_and_metric_scores"]
[docs] def auuc_table(self): """ Retrieve all types of AUUC in a table. :returns: a table of AUUCs. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.auuc_table() """ return self._metric_json["auuc_table"]
[docs] def aecu_table(self): """ Retrieve all types of AECU values in a table. :returns: a table of AECU values. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.aecu_table() """ return self._metric_json["aecu_table"]
[docs] def plot_uplift(self, server=False, save_to_file=None, plot=True, metric="AUTO", normalize=False): """ Plot Uplift Curve. :param server: if ``True``, generate plot inline using matplotlib's Anti-Grain Geometry (AGG) backend. :param save_to_file: filename to save the plot to. :param plot: ``True`` to plot curve, ``False`` to get a tuple of values at axis x and y of the plot (number of observations and uplift values) :param metric: AUUC metric type. One of: - "qini" - "lift" - "gain" - "AUTO" (default; defaults to "qini") :param normalize: If ``True``, normalized values are plotted. :examples: >>> from h2o.estimators import H2OUpliftRandomForestEstimator >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") >>> treatment_column = "treatment" >>> response_column = "conversion" >>> train[treatment_column] = train[treatment_column].asfactor() >>> train[response_column] = train[response_column].asfactor() >>> predictors = ["f1", "f2", "f3", "f4", "f5", "f6"] >>> >>> uplift_model = H2OUpliftRandomForestEstimator(ntrees=10, ... max_depth=5, ... treatment_column=treatment_column, ... uplift_metric="kl", ... distribution="bernoulli", ... min_rows=10, ... auuc_type="gain") >>> uplift_model.train(y=response_column, x=predictors, training_frame=train) >>> perf = uplift_model.model_performance() >>> perf.plot_uplift(plot=True) >>> n, uplift = perf.plot_uplift(plot=False) """ assert metric in ['AUTO', 'qini', 'lift', 'gain'], \ "Metric "+metric+" should be 'AUTO', 'qini','lift' or 'gain'." 
if plot: plt = get_matplotlib_pyplot(server) if plt is None: return plt.ylabel('Cumulative '+metric) plt.xlabel('Number Targeted') rnd = self.uplift_random(metric) if normalize: plt.title('Cumulate Uplift Curve - '+metric+"\n"+r'Normalized AUUC={0:.4f}'.format(self.auuc_normalized(metric))) uplift = self.uplift_normalized(metric) if metric != "lift": max = abs(rnd[len(rnd)-1]) rnd = [x / max for x in rnd] else: plt.title('Cumulate Uplift Curve - '+metric+"\n"+r'AUUC={0:.4f}'.format(self.auuc(metric))) uplift = self.uplift(metric) n = self.n() plt.plot(n, uplift, 'b-', label='uplift') plt.plot(n, rnd, 'k--', label='random') if metric == "lift": plt.legend(loc='upper right') else: plt.legend(loc='lower right') plt.grid(True) plt.tight_layout() if not server: plt.show() if save_to_file is not None: # only save when a figure is actually plotted plt.savefig(save_to_file) else: return self.n(), self.uplift(metric)