Source code for h2o.estimators.kmeans

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OKMeansEstimator(H2OEstimator):
    """
    K-means

    Performs k-means clustering on an H2O dataset.
    """

    algo = "kmeans"
    supervised_learning = False

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 nfolds=0,  # type: int
                 keep_cross_validation_models=True,  # type: bool
                 keep_cross_validation_predictions=False,  # type: bool
                 keep_cross_validation_fold_assignment=False,  # type: bool
                 fold_assignment="auto",  # type: Literal["auto", "random", "modulo", "stratified"]
                 fold_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 score_each_iteration=False,  # type: bool
                 k=1,  # type: int
                 estimate_k=False,  # type: bool
                 user_points=None,  # type: Optional[Union[None, str, H2OFrame]]
                 max_iterations=10,  # type: int
                 standardize=True,  # type: bool
                 seed=-1,  # type: int
                 init="furthest",  # type: Literal["random", "plus_plus", "furthest", "user"]
                 max_runtime_secs=0.0,  # type: float
                 categorical_encoding="auto",  # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
                 export_checkpoints_dir=None,  # type: Optional[str]
                 cluster_size_constraints=None,  # type: Optional[List[int]]
                 ):
        """
        Create a K-means estimator.

        ``model_id`` is written straight into ``self._parms`` (and ``self._id``); every other
        argument is assigned through the property of the same name.

        :param model_id: Id under which the model is stored; generated automatically when omitted.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the data frame to train on.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the data frame to validate on.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param nfolds: Fold count for K-fold cross-validation (0 disables it, otherwise >= 2).
               Defaults to ``0``.
        :type nfolds: int
        :param keep_cross_validation_models: Keep the cross-validation models if true.
               Defaults to ``True``.
        :type keep_cross_validation_models: bool
        :param keep_cross_validation_predictions: Keep the predictions of the cross-validation models if true.
               Defaults to ``False``.
        :type keep_cross_validation_predictions: bool
        :param keep_cross_validation_fold_assignment: Keep the cross-validation fold assignment if true.
               Defaults to ``False``.
        :type keep_cross_validation_fold_assignment: bool
        :param fold_assignment: Scheme for assigning cross-validation folds when fold_column is not given. With
               'Stratified', folds are stratified on the response variable (classification problems).
               Defaults to ``"auto"``.
        :type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
        :param fold_column: Column holding the cross-validation fold index of each observation.
               Defaults to ``None``.
        :type fold_column: str, optional
        :param ignored_columns: Column names that training should ignore.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns if true.
               Defaults to ``True``.
        :type ignore_const_cols: bool
        :param score_each_iteration: Score during every iteration of model training if true.
               Defaults to ``False``.
        :type score_each_iteration: bool
        :param k: Upper bound on the number of clusters. With estimate_k disabled the model finds k centroids;
               with it enabled, up to k centroids.
               Defaults to ``1``.
        :type k: int
        :param estimate_k: Estimate the number of clusters (<=k) iteratively and deterministically if true.
               Defaults to ``False``.
        :type estimate_k: bool
        :param user_points: A dataframe in which each row is an initial cluster center. The user-specified points
               must have as many columns as the training observations, and the number of rows must equal the
               number of clusters.
               Defaults to ``None``.
        :type user_points: Union[None, str, H2OFrame], optional
        :param max_iterations: Upper bound on training iterations (with estimate_k enabled, this applies to each
               inner Lloyds iteration).
               Defaults to ``10``.
        :type max_iterations: int
        :param standardize: Standardize columns before computing distances if true.
               Defaults to ``True``.
        :type standardize: bool
        :param seed: Seed for the random number generator.
               Defaults to ``-1``.
        :type seed: int
        :param init: Initialization mode.
               Defaults to ``"furthest"``.
        :type init: Literal["random", "plus_plus", "furthest", "user"]
        :param max_runtime_secs: Longest runtime, in seconds, allowed for model training; 0 disables the limit.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param categorical_encoding: Scheme used to encode categorical features.
               Defaults to ``"auto"``.
        :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
               "sort_by_response", "enum_limited"]
        :param export_checkpoints_dir: Directory to which generated models are exported automatically.
               Defaults to ``None``.
        :type export_checkpoints_dir: str, optional
        :param cluster_size_constraints: Array with the minimum number of points each cluster should contain. Its
               length has to match the number of clusters.
               Defaults to ``None``.
        :type cluster_size_constraints: List[int], optional
        """
        super(H2OKMeansEstimator, self).__init__()
        # Start from an empty parameter dict; the property setters below fill it in.
        self._parms = {}
        self._id = model_id
        self._parms['model_id'] = model_id
        # Assign through the properties, in signature order, so each setter runs its own checks.
        for attr_name, attr_value in (
                ("training_frame", training_frame),
                ("validation_frame", validation_frame),
                ("nfolds", nfolds),
                ("keep_cross_validation_models", keep_cross_validation_models),
                ("keep_cross_validation_predictions", keep_cross_validation_predictions),
                ("keep_cross_validation_fold_assignment", keep_cross_validation_fold_assignment),
                ("fold_assignment", fold_assignment),
                ("fold_column", fold_column),
                ("ignored_columns", ignored_columns),
                ("ignore_const_cols", ignore_const_cols),
                ("score_each_iteration", score_each_iteration),
                ("k", k),
                ("estimate_k", estimate_k),
                ("user_points", user_points),
                ("max_iterations", max_iterations),
                ("standardize", standardize),
                ("seed", seed),
                ("init", init),
                ("max_runtime_secs", max_runtime_secs),
                ("categorical_encoding", categorical_encoding),
                ("export_checkpoints_dir", export_checkpoints_dir),
                ("cluster_size_constraints", cluster_size_constraints),
        ):
            setattr(self, attr_name, attr_value)

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
        ...               "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_km = H2OKMeansEstimator(seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        """Pass the value through ``H2OFrame._validate`` and store the result under ``"training_frame"``."""
        validated = H2OFrame._validate(training_frame, 'training_frame')
        self._parms["training_frame"] = validated

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
        ...               "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_km = H2OKMeansEstimator(seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        """Pass the value through ``H2OFrame._validate`` and store the result under ``"validation_frame"``."""
        validated = H2OFrame._validate(validation_frame, 'validation_frame')
        self._parms["validation_frame"] = validated

    @property
    def nfolds(self):
        """
        Number of folds for K-fold cross-validation (0 to disable or >= 2).

        Type: ``int``, defaults to ``0``.

        :examples:

        >>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(nfolds=5, seed=1234)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("nfolds")

    @nfolds.setter
    def nfolds(self, nfolds):
        """Check the value with ``assert_is_type`` (``None`` or ``int``), then store it under ``"nfolds"``."""
        assert_is_type(nfolds, None, int)
        params = self._parms
        params["nfolds"] = nfolds

    @property
    def keep_cross_validation_models(self):
        """
        Whether to keep the cross-validation models.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
        >>> predictors = ["radiation","temperature","wind"]
        >>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
        >>> ozone_km = H2OKMeansEstimator(keep_cross_validation_models=True,
        ...                               nfolds=5,
        ...                               seed=1234)
        >>> ozone_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> ozone_km.scoring_history()
        """
        return self._parms.get("keep_cross_validation_models")

    @keep_cross_validation_models.setter
    def keep_cross_validation_models(self, keep_cross_validation_models):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it in ``self._parms``."""
        assert_is_type(keep_cross_validation_models, None, bool)
        params = self._parms
        params["keep_cross_validation_models"] = keep_cross_validation_models

    @property
    def keep_cross_validation_predictions(self):
        """
        Whether to keep the predictions of the cross-validation models.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
        ...               "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_km = H2OKMeansEstimator(keep_cross_validation_predictions=True,
        ...                              nfolds=5,
        ...                              seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("keep_cross_validation_predictions")

    @keep_cross_validation_predictions.setter
    def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it in ``self._parms``."""
        assert_is_type(keep_cross_validation_predictions, None, bool)
        params = self._parms
        params["keep_cross_validation_predictions"] = keep_cross_validation_predictions

    @property
    def keep_cross_validation_fold_assignment(self):
        """
        Whether to keep the cross-validation fold assignment.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
        >>> predictors = ["radiation","temperature","wind"]
        >>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
        >>> ozone_km = H2OKMeansEstimator(keep_cross_validation_fold_assignment=True,
        ...                               nfolds=5,
        ...                               seed=1234)
        >>> ozone_km.train(x=predictors,
        ...                training_frame=train)
        >>> ozone_km.scoring_history()
        """
        return self._parms.get("keep_cross_validation_fold_assignment")

    @keep_cross_validation_fold_assignment.setter
    def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it in ``self._parms``."""
        assert_is_type(keep_cross_validation_fold_assignment, None, bool)
        params = self._parms
        params["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment

    @property
    def fold_assignment(self):
        """
        Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
        the folds based on the response variable, for classification problems.

        Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``.

        :examples:

        >>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
        >>> predictors = ["radiation","temperature","wind"]
        >>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
        >>> ozone_km = H2OKMeansEstimator(fold_assignment="Random",
        ...                               nfolds=5,
        ...                               seed=1234)
        >>> ozone_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> ozone_km.scoring_history()
        """
        return self._parms.get("fold_assignment")

    @fold_assignment.setter
    def fold_assignment(self, fold_assignment):
        """Check the value with ``assert_is_type`` (``None`` or one of the listed names), then store it."""
        allowed = Enum("auto", "random", "modulo", "stratified")
        assert_is_type(fold_assignment, None, allowed)
        self._parms["fold_assignment"] = fold_assignment

    @property
    def fold_column(self):
        """
        Column with cross-validation fold index assignment per observation.

        Type: ``str``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
        >>> fold_numbers.set_names(["fold_numbers"])
        >>> cars = cars.cbind(fold_numbers)
        >>> print(cars['fold_numbers'])
        >>> cars_km = H2OKMeansEstimator(seed=1234)
        >>> cars_km.train(x=predictors,
        ...               training_frame=cars,
        ...               fold_column="fold_numbers")
        >>> cars_km.scoring_history()
        """
        return self._parms.get("fold_column")

    @fold_column.setter
    def fold_column(self, fold_column):
        """Check the value with ``assert_is_type`` (``None`` or ``str``), then store it under ``"fold_column"``."""
        assert_is_type(fold_column, None, str)
        params = self._parms
        params["fold_column"] = fold_column

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        """Check the value with ``assert_is_type`` (``None`` or ``[str]``), then store it in ``self._parms``."""
        assert_is_type(ignored_columns, None, [str])
        params = self._parms
        params["ignored_columns"] = ignored_columns

    @property
    def ignore_const_cols(self):
        """
        Ignore constant columns.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> cars["const_1"] = 6
        >>> cars["const_2"] = 7
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_km = H2OKMeansEstimator(ignore_const_cols=True,
        ...                              seed=1234)
        >>> cars_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> cars_km.scoring_history()
        """
        return self._parms.get("ignore_const_cols")

    @ignore_const_cols.setter
    def ignore_const_cols(self, ignore_const_cols):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it in ``self._parms``."""
        assert_is_type(ignore_const_cols, None, bool)
        params = self._parms
        params["ignore_const_cols"] = ignore_const_cols

    @property
    def score_each_iteration(self):
        """
        Whether to score during each iteration of model training.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(score_each_iteration=True,
        ...                                seed=1234)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("score_each_iteration")

    @score_each_iteration.setter
    def score_each_iteration(self, score_each_iteration):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it in ``self._parms``."""
        assert_is_type(score_each_iteration, None, bool)
        params = self._parms
        params["score_each_iteration"] = score_each_iteration

    @property
    def k(self):
        """
        The max. number of clusters. If estimate_k is disabled, the model will find k centroids, otherwise it will find
        up to k centroids.

        Type: ``int``, defaults to ``1``.

        :examples:

        >>> seeds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/seeds_dataset.txt")
        >>> predictors = seeds.columns[0:7]
        >>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
        >>> seeds_km = H2OKMeansEstimator(k=3, seed=1234)
        >>> seeds_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> seeds_km.scoring_history()
        """
        return self._parms.get("k")

    @k.setter
    def k(self, k):
        """Check the value with ``assert_is_type`` (``None`` or ``int``), then store it under ``"k"``."""
        assert_is_type(k, None, int)
        params = self._parms
        params["k"] = k

    @property
    def estimate_k(self):
        """
        Whether to estimate the number of clusters (<=k) iteratively and deterministically.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
        >>> iris['class'] = iris['class'].asfactor()
        >>> predictors = iris.columns[:-1]
        >>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
        >>> iris_kmeans = H2OKMeansEstimator(k=10,
        ...                                  estimate_k=True,
        ...                                  standardize=False,
        ...                                  seed=1234)
        >>> iris_kmeans.train(x=predictors,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        >>> iris_kmeans.scoring_history()
        """
        return self._parms.get("estimate_k")

    @estimate_k.setter
    def estimate_k(self, estimate_k):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it under ``"estimate_k"``."""
        assert_is_type(estimate_k, None, bool)
        params = self._parms
        params["estimate_k"] = estimate_k

    @property
    def user_points(self):
        """
        This option allows you to specify a dataframe, where each row represents an initial cluster center. The user-
        specified points must have the same number of columns as the training observations. The number of rows must
        equal the number of clusters

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
        >>> iris['class'] = iris['class'].asfactor()
        >>> predictors = iris.columns[:-1]
        >>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
        >>> point1 = [4.9,3.0,1.4,0.2]
        >>> point2 = [5.6,2.5,3.9,1.1]
        >>> point3 = [6.5,3.0,5.2,2.0]
        >>> points = h2o.H2OFrame([point1, point2, point3])
        >>> iris_km = H2OKMeansEstimator(k=3,
        ...                              user_points=points,
        ...                              seed=1234)
        >>> iris_km.train(x=predictors,
        ...               training_frame=iris,
        ...               validation_frame=valid)
        >>> iris_kmeans.tot_withinss(valid=True)
        """
        return self._parms.get("user_points")

    @user_points.setter
    def user_points(self, user_points):
        """Pass the value through ``H2OFrame._validate`` and store the result under ``"user_points"``."""
        validated = H2OFrame._validate(user_points, 'user_points')
        self._parms["user_points"] = validated

    @property
    def max_iterations(self):
        """
        Maximum training iterations (if estimate_k is enabled, then this is for each inner Lloyds iteration)

        Type: ``int``, defaults to ``10``.

        :examples:

        >>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(max_iterations=50)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("max_iterations")

    @max_iterations.setter
    def max_iterations(self, max_iterations):
        """Check the value with ``assert_is_type`` (``None`` or ``int``), then store it in ``self._parms``."""
        assert_is_type(max_iterations, None, int)
        params = self._parms
        params["max_iterations"] = max_iterations

    @property
    def standardize(self):
        """
        Standardize columns before computing distances

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
        >>> predictors = boston.columns[:-1]
        >>> boston['chas'] = boston['chas'].asfactor()
        >>> train, valid = boston.split_frame(ratios=[.8])
        >>> boston_km = H2OKMeansEstimator(standardize=True)
        >>> boston_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> boston_km.scoring_history()
        """
        return self._parms.get("standardize")

    @standardize.setter
    def standardize(self, standardize):
        """Check the value with ``assert_is_type`` (``None`` or ``bool``), then store it under ``"standardize"``."""
        assert_is_type(standardize, None, bool)
        params = self._parms
        params["standardize"] = standardize

    @property
    def seed(self):
        """
        RNG Seed

        Type: ``int``, defaults to ``-1``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_w_seed = H2OKMeansEstimator(seed=1234)
        >>> pros_w_seed.train(x=predictors,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        >>> pros_wo_seed = H2OKMeansEstimator()
        >>> pros_wo_seed.train(x=predictors,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> pros_w_seed.scoring_history()
        >>> pros_wo_seed.scoring_history()
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        """Check the value with ``assert_is_type`` (``None`` or ``int``), then store it under ``"seed"``."""
        assert_is_type(seed, None, int)
        params = self._parms
        params["seed"] = seed

    @property
    def init(self):
        """
        Initialization mode

        Type: ``Literal["random", "plus_plus", "furthest", "user"]``, defaults to ``"furthest"``.

        :examples:

        >>> seeds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/seeds_dataset.txt")
        >>> predictors = seeds.columns[0:7]
        >>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
        >>> seeds_km = H2OKMeansEstimator(k=3,
        ...                               init='Furthest',
        ...                               seed=1234)
        >>> seeds_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame= valid)
        >>> seeds_km.scoring_history()
        """
        return self._parms.get("init")

    @init.setter
    def init(self, init):
        """Check the value with ``assert_is_type`` (``None`` or one of the listed names), then store it."""
        allowed = Enum("random", "plus_plus", "furthest", "user")
        assert_is_type(init, None, allowed)
        self._parms["init"] = init

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.

        :examples:

        >>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(max_runtime_secs=10,
        ...                                seed=1234)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        """Check the value with ``assert_is_type`` (``None`` or ``numeric``), then store it in ``self._parms``."""
        assert_is_type(max_runtime_secs, None, numeric)
        params = self._parms
        params["max_runtime_secs"] = max_runtime_secs

    @property
    def categorical_encoding(self):
        """
        Encoding scheme for categorical features

        Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
        "sort_by_response", "enum_limited"]``, defaults to ``"auto"``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> encoding = "one_hot_explicit"
        >>> pros_km = H2OKMeansEstimator(categorical_encoding=encoding,
        ...                              seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("categorical_encoding")

    @categorical_encoding.setter
    def categorical_encoding(self, categorical_encoding):
        """Check the value with ``assert_is_type`` (``None`` or one of the listed names), then store it."""
        allowed = Enum(
            "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary",
            "eigen", "label_encoder", "sort_by_response", "enum_limited",
        )
        assert_is_type(categorical_encoding, None, allowed)
        self._parms["categorical_encoding"] = categorical_encoding

    @property
    def export_checkpoints_dir(self):
        """
        Automatically export generated models to this directory.

        Type: ``str``.

        :examples:

        >>> import tempfile
        >>> from os import listdir
        >>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
        >>> predictors = ["DayofMonth", "DayOfWeek"]
        >>> checkpoints_dir = tempfile.mkdtemp()
        >>> air_km = H2OKMeansEstimator(export_checkpoints_dir=checkpoints_dir,
        ...                             seed=1234)
        >>> air_km.train(x=predictors, training_frame=airlines)
        >>> len(listdir(checkpoints_dir))
        """
        return self._parms.get("export_checkpoints_dir")

    @export_checkpoints_dir.setter
    def export_checkpoints_dir(self, export_checkpoints_dir):
        """Check the value with ``assert_is_type`` (``None`` or ``str``), then store it in ``self._parms``."""
        assert_is_type(export_checkpoints_dir, None, str)
        params = self._parms
        params["export_checkpoints_dir"] = export_checkpoints_dir

    @property
    def cluster_size_constraints(self):
        """
        An array specifying the minimum number of points that should be in each cluster. The length of the constraints
        array has to be the same as the number of clusters.

        Type: ``List[int]``.

        :examples:

        >>> iris_h2o = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
        >>> k=3
        >>> start_points = h2o.H2OFrame(
        ...         [[4.9, 3.0, 1.4, 0.2],
        ...          [5.6, 2.5, 3.9, 1.1],
        ...          [6.5, 3.0, 5.2, 2.0]])
        >>> kmm = H2OKMeansEstimator(k=k,
        ...                          user_points=start_points,
        ...                          standardize=True,
        ...                          cluster_size_constraints=[2, 5, 8],
        ...                          score_each_iteration=True)
        >>> kmm.train(x=list(range(7)), training_frame=iris_h2o)
        >>> kmm.scoring_history()
        """
        return self._parms.get("cluster_size_constraints")

    @cluster_size_constraints.setter
    def cluster_size_constraints(self, cluster_size_constraints):
        """Check the value with ``assert_is_type`` (``None`` or ``[int]``), then store it in ``self._parms``."""
        assert_is_type(cluster_size_constraints, None, [int])
        params = self._parms
        params["cluster_size_constraints"] = cluster_size_constraints