# -*- encoding: utf-8 -*-
from h2o.utils.compatibility import * # NOQA
from h2o.model import ModelBase
class H2OClusteringModel(ModelBase):
    """Accessors for the metrics and centers stored on a clustering model."""

    def _clustering_metric(self, json_key, train=False, valid=False, xval=False, column=None):
        """
        Collect one entry of the metric JSON for each requested data split.

        :param str json_key: Key to read from each metric object's ``_metric_json``.
        :param bool train: Forwarded to ``ModelBase._get_metrics``.
        :param bool valid: Forwarded to ``ModelBase._get_metrics``.
        :param bool xval: Forwarded to ``ModelBase._get_metrics``.
        :param column: If not ``None``, the looked-up entry is treated as a table and only this column
            index of each row in its ``cell_values`` is kept (as a list).

        :returns: The single collected value when exactly one split was returned by
            ``ModelBase._get_metrics``; otherwise a dictionary keyed by split name. A split whose
            metric object is ``None`` maps to ``None``.
        """
        metrics = ModelBase._get_metrics(self, train, valid, xval)
        collected = {}
        for split, metric in metrics.items():
            if metric is None:
                collected[split] = None
                continue
            value = metric._metric_json[json_key]
            if column is not None:
                value = [row[column] for row in value.cell_values]
            collected[split] = value
        # A single split is unwrapped so callers get the bare value rather than a one-entry dict.
        if len(collected) == 1:
            return next(iter(collected.values()))
        return collected

    def size(self, train=False, valid=False):
        """
        Get the sizes of each cluster.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train" and "valid". This metric is not available in cross-validation metrics.

        :param bool train: If ``True``, return the cluster sizes for the training data.
        :param bool valid: If ``True``, return the cluster sizes for the validation data.

        :returns: The cluster sizes for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> size = km.size() # <- Default: return training metrics
        >>> size
        >>> size1 = km.size(train=False,
        ...                 valid=False)
        >>> size1
        """
        # The value is taken from the third column (index 2) of each "centroid_stats" row.
        return self._clustering_metric("centroid_stats", train, valid, False, column=2)

    def num_iterations(self):
        """
        Get the number of iterations it took to converge or reach max iterations.

        :returns: The first element of the ``number_of_iterations`` entry of the model summary.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.num_iterations()
        """
        summary = self._model_json["output"]["model_summary"]
        return summary["number_of_iterations"][0]

    def betweenss(self, train=False, valid=False, xval=False):
        """
        Get the between cluster sum of squares.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train", "valid", and "xval".

        :param bool train: If ``True``, return the between cluster sum of squares value for the training data.
        :param bool valid: If ``True``, return the between cluster sum of squares value for the validation data.
        :param bool xval: If ``True``, return the between cluster sum of squares value for each of the
            cross-validated splits.

        :returns: The between cluster sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> betweenss = km.betweenss() # <- Default: return training metrics
        >>> betweenss
        >>> betweenss3 = km.betweenss(train=False,
        ...                           valid=False,
        ...                           xval=True)
        >>> betweenss3
        """
        return self._clustering_metric("betweenss", train, valid, xval)

    def totss(self, train=False, valid=False, xval=False):
        """
        Get the total sum of squares.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train", "valid", and "xval".

        :param bool train: If ``True``, return the total sum of squares value for the training data.
        :param bool valid: If ``True``, return the total sum of squares value for the validation data.
        :param bool xval: If ``True``, return the total sum of squares value for each of the cross-validated splits.

        :returns: The total sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> totss = km.totss() # <- Default: return training metrics
        >>> totss
        """
        return self._clustering_metric("totss", train, valid, xval)

    def tot_withinss(self, train=False, valid=False, xval=False):
        """
        Get the total within cluster sum of squares.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train", "valid", and "xval".

        :param bool train: If ``True``, return the total within cluster sum of squares value for the training data.
        :param bool valid: If ``True``, return the total within cluster sum of squares value for the validation data.
        :param bool xval: If ``True``, return the total within cluster sum of squares value for each of the
            cross-validated splits.

        :returns: The total within cluster sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> tot_withinss = km.tot_withinss() # <- Default: return training metrics
        >>> tot_withinss
        >>> tot_withinss2 = km.tot_withinss(train=True,
        ...                                 valid=False,
        ...                                 xval=True)
        >>> tot_withinss2
        """
        return self._clustering_metric("tot_withinss", train, valid, xval)

    def withinss(self, train=False, valid=False):
        """
        Get the within cluster sum of squares for each cluster.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train" and "valid". This metric is not available in cross-validation metrics.

        :param bool train: If ``True``, return the within cluster sum of squares of each cluster for the
            training data.
        :param bool valid: If ``True``, return the within cluster sum of squares of each cluster for the
            validation data.

        :returns: The within cluster sum of squares values for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> withinss = km.withinss() # <- Default: return training metrics
        >>> withinss
        >>> withinss2 = km.withinss(train=True,
        ...                         valid=True)
        >>> withinss2
        """
        # The value is taken from the last column of each "centroid_stats" row.
        return self._clustering_metric("centroid_stats", train, valid, False, column=-1)

    def centroid_stats(self, train=False, valid=False):
        """
        Get the centroid statistics for each cluster.

        If all are ``False`` (default), then return the training metric value.
        If more than one option is set to ``True``, then return a dictionary of metrics where
        the keys are "train" and "valid". This metric is not available in cross-validation metrics.

        :param bool train: If ``True``, return the centroid statistic for the training data.
        :param bool valid: If ``True``, return the centroid statistic for the validation data.

        :returns: The centroid statistics for the specified key(s).

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> centroid_stats = km.centroid_stats() # <- Default: return training metrics
        >>> centroid_stats
        >>> centroid_stats1 = km.centroid_stats(train=True,
        ...                                     valid=False)
        >>> centroid_stats1
        """
        return self._clustering_metric("centroid_stats", train, valid, False)

    def centers(self):
        """
        The centers for the KMeans model.

        :returns: A list with one inner list per row of the model's ``centers`` table; the first
            column of every row is dropped.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.centers()
        """
        rows = self._model_json["output"]["centers"].cell_values
        # Skip the leading column of each row (presumably the centroid label -- confirm).
        return [list(row[1:]) for row in rows]

    def centers_std(self):
        """
        The standardized centers for the KMeans model.

        :returns: The rows of the model's ``centers_std`` table with the first column of every row
            dropped, then transposed (one inner list per remaining column).

        NOTE(review): unlike :meth:`centers`, this result is transposed. The behavior is kept as-is
        for backward compatibility -- confirm whether the difference is intentional.

        :examples:

        >>> from h2o.estimators.kmeans import H2OKMeansEstimator
        >>>
        >>> iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv")
        >>> km = H2OKMeansEstimator(k=3, nfolds=3)
        >>> km.train(x=list(range(4)), training_frame=iris)
        >>> km.centers_std()
        """
        rows = self._model_json["output"]["centers_std"].cell_values
        trimmed = [list(row[1:]) for row in rows]
        # zip(*...) swaps rows and columns of the trimmed table.
        return [list(column) for column in zip(*trimmed)]