Source code for h2o.model.models.clustering

# -*- encoding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.utils.compatibility import *  # NOQA

from h2o.model import ModelBase


class H2OClusteringModel(ModelBase):
    """Accessors for the metrics and centers stored on a clustering model.

    The metric accessors share one return convention: the requested metrics
    objects are fetched through ``ModelBase._get_metrics``; when exactly one
    entry comes back its value is returned directly, otherwise a dictionary
    keyed the same way as the fetched metrics is returned. An entry whose
    metrics object is ``None`` yields ``None``.
    """

    def size(self, train=False, valid=False):
        """
        Get the sizes of each cluster.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train" and "valid". This metric is not available in cross-validation metrics.

        :param bool train: If ``True``, return the cluster sizes for the training data.
        :param bool valid: If ``True``, return the cluster sizes for the validation data.

        :returns: The cluster sizes for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> size = km.size() # <- Default: return training metrics
        >>> size
        >>> size1 = km.size(train=False,
        ...                 valid=False)
        >>> size1
        """
        # Cross-validation metrics are never requested here (xval=False).
        tm = ModelBase._get_metrics(self, train, valid, False)
        m = {}
        for k, v in tm.items():
            if v is None:
                m[k] = None
            else:
                # The value at index 2 of every "centroid_stats" row is reported as the size.
                m[k] = [row[2] for row in v._metric_json["centroid_stats"].cell_values]
        return list(m.values())[0] if len(m) == 1 else m

    def num_iterations(self):
        """
        Get the number of iterations it took to converge or reach max iterations.

        :returns: The first entry of the "number_of_iterations" column of the model summary.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.num_iterations()
        """
        o = self._model_json["output"]
        return o["model_summary"]["number_of_iterations"][0]

    def betweenss(self, train=False, valid=False, xval=False):
        """
        Get the between cluster sum of squares.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train", "valid", and "xval".

        :param bool train: If ``True``, return the between cluster sum of squares value for the training data.
        :param bool valid: If ``True``, return the between cluster sum of squares value for the validation data.
        :param bool xval: If ``True``, return the between cluster sum of squares value for each of the
            cross-validated splits.

        :returns: The between cluster sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> betweenss = km.betweenss() # <- Default: return training metrics
        >>> betweenss
        >>> betweenss3 = km.betweenss(train=False,
        ...                           valid=False,
        ...                           xval=True)
        >>> betweenss3
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in tm.items():
            m[k] = None if v is None else v._metric_json["betweenss"]
        return list(m.values())[0] if len(m) == 1 else m

    def totss(self, train=False, valid=False, xval=False):
        """
        Get the total sum of squares.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train", "valid", and "xval".

        :param bool train: If ``True``, return the total sum of squares value for the training data.
        :param bool valid: If ``True``, return the total sum of squares value for the validation data.
        :param bool xval: If ``True``, return the total sum of squares value for each of the
            cross-validated splits.

        :returns: The total sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> totss = km.totss() # <- Default: return training metrics
        >>> totss
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in tm.items():
            m[k] = None if v is None else v._metric_json["totss"]
        return list(m.values())[0] if len(m) == 1 else m

    def tot_withinss(self, train=False, valid=False, xval=False):
        """
        Get the total within cluster sum of squares.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train", "valid", and "xval".

        :param bool train: If ``True``, return the total within cluster sum of squares value for the training data.
        :param bool valid: If ``True``, return the total within cluster sum of squares value for the validation data.
        :param bool xval: If ``True``, return the total within cluster sum of squares value for each of the
            cross-validated splits.

        :returns: The total within cluster sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> tot_withinss = km.tot_withinss() # <- Default: return training metrics
        >>> tot_withinss
        >>> tot_withinss2 = km.tot_withinss(train=True,
        ...                                 valid=False,
        ...                                 xval=True)
        >>> tot_withinss2
        """
        tm = ModelBase._get_metrics(self, train, valid, xval)
        m = {}
        for k, v in tm.items():
            m[k] = None if v is None else v._metric_json["tot_withinss"]
        return list(m.values())[0] if len(m) == 1 else m

    def withinss(self, train=False, valid=False):
        """
        Get the within cluster sum of squares for each cluster.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train" and "valid". This metric is not available in cross-validation metrics.

        :param bool train: If ``True``, return the within cluster sum of squares values for the training data.
        :param bool valid: If ``True``, return the within cluster sum of squares values for the validation data.

        :returns: The within cluster sum of squares values, one per cluster, for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> withinss = km.withinss() # <- Default: return training metrics
        >>> withinss
        >>> withinss2 = km.withinss(train=True,
        ...                         valid=True)
        >>> withinss2
        """
        # Cross-validation metrics are never requested here (xval=False).
        tm = ModelBase._get_metrics(self, train, valid, False)
        m = {}
        for k, v in tm.items():
            if v is None:
                m[k] = None
            else:
                # The last value of every "centroid_stats" row is reported as the within-cluster SS.
                m[k] = [row[-1] for row in v._metric_json["centroid_stats"].cell_values]
        return list(m.values())[0] if len(m) == 1 else m

    def centroid_stats(self, train=False, valid=False):
        """
        Get the centroid statistics for each cluster.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train" and "valid". This metric is not available in cross-validation metrics.

        :param bool train: If ``True``, return the centroid statistic for the training data.
        :param bool valid: If ``True``, return the centroid statistic for the validation data.

        :returns: The centroid statistics for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> centroid_stats = km.centroid_stats() # <- Default: return training metrics
        >>> centroid_stats
        >>> centroid_stats1 = km.centroid_stats(train=True,
        ...                                     valid=False)
        >>> centroid_stats1
        """
        # Cross-validation metrics are never requested here (xval=False).
        tm = ModelBase._get_metrics(self, train, valid, False)
        m = {}
        for k, v in tm.items():
            m[k] = None if v is None else v._metric_json["centroid_stats"]
        return list(m.values())[0] if len(m) == 1 else m

    def centers(self):
        """
        The centers for the KMeans model.

        :returns: A list with one inner list per row of the "centers" output table, each holding
            that row's values with the first column dropped.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.centers()
        """
        o = self._model_json["output"]
        cvals = o["centers"].cell_values
        # Skip the leading column of every row (presumably the centroid label -- TODO confirm).
        return [list(cval[1:]) for cval in cvals]

    def centers_std(self):
        """
        The standardized centers for the KMeans model.

        :returns: A list of lists built from the "centers_std" output table: the first column of
            every row is dropped and the remainder is transposed, so each inner list holds one
            table column across all rows.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.centers_std()
        """
        o = self._model_json["output"]
        cvals = o["centers_std"].cell_values
        # Skip the leading column of every row (presumably the centroid label -- TODO confirm).
        rows = [list(cval[1:]) for cval in cvals]
        # NOTE(review): unlike ``centers()``, the result is transposed here. Kept as-is because
        # callers may rely on this orientation -- confirm whether the difference is intentional.
        return [list(x) for x in zip(*rows)]