#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details)
#
from __future__ import absolute_import, division, print_function, unicode_literals
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric
class H2OKMeansEstimator(H2OEstimator):
"""
K-means

Performs k-means clustering on an H2O dataset.
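A minimal end-to-end sketch reusing the public prostate dataset from the parameter
examples below (assumes ``h2o`` has already been imported and ``h2o.init()`` called):
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
... "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(k=3, seed=1234)
>>> pros_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_km.scoring_history()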
"""
algo = "kmeans"
param_names = {"model_id", "training_frame", "validation_frame", "nfolds", "keep_cross_validation_models",
"keep_cross_validation_predictions", "keep_cross_validation_fold_assignment", "fold_assignment",
"fold_column", "ignored_columns", "ignore_const_cols", "score_each_iteration", "k", "estimate_k",
"user_points", "max_iterations", "standardize", "seed", "init", "max_runtime_secs",
"categorical_encoding", "export_checkpoints_dir", "cluster_size_constraints"}
def __init__(self, **kwargs):
super(H2OKMeansEstimator, self).__init__()
self._parms = {}
for pname, pvalue in kwargs.items():
if pname == 'model_id':
self._id = pvalue
self._parms["model_id"] = pvalue
elif pname in self.param_names:
# Using setattr(...) will invoke type-checking of the arguments
setattr(self, pname, pvalue)
else:
raise H2OValueError("Unknown parameter %s = %r" % (pname, pvalue))
@property
def training_frame(self):
"""
Id of the training data frame.
Type: ``H2OFrame``.
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
... "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(seed=1234)
>>> pros_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_km.scoring_history()
"""
return self._parms.get("training_frame")
@training_frame.setter
def training_frame(self, training_frame):
self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')
@property
def validation_frame(self):
"""
Id of the validation data frame.
Type: ``H2OFrame``.
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
... "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(seed=1234)
>>> pros_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_km.scoring_history()
"""
return self._parms.get("validation_frame")
@validation_frame.setter
def validation_frame(self, validation_frame):
self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')
@property
def nfolds(self):
"""
Number of folds for K-fold cross-validation (0 to disable or >= 2).
Type: ``int`` (default: ``0``).
:examples:
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
... "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(nfolds=5, seed=1234)
>>> benign_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> benign_km.scoring_history()
"""
return self._parms.get("nfolds")
@nfolds.setter
def nfolds(self, nfolds):
assert_is_type(nfolds, None, int)
self._parms["nfolds"] = nfolds
@property
def keep_cross_validation_models(self):
"""
Whether to keep the cross-validation models.
Type: ``bool`` (default: ``True``).
:examples:
>>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
>>> predictors = ["radiation","temperature","wind"]
>>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
>>> ozone_km = H2OKMeansEstimator(keep_cross_validation_models=True,
... nfolds=5,
... seed=1234)
>>> ozone_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> ozone_km.scoring_history()
"""
return self._parms.get("keep_cross_validation_models")
@keep_cross_validation_models.setter
def keep_cross_validation_models(self, keep_cross_validation_models):
assert_is_type(keep_cross_validation_models, None, bool)
self._parms["keep_cross_validation_models"] = keep_cross_validation_models
@property
def keep_cross_validation_predictions(self):
"""
Whether to keep the predictions of the cross-validation models.
Type: ``bool`` (default: ``False``).
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
... "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(keep_cross_validation_predictions=True,
... nfolds=5,
... seed=1234)
>>> pros_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_km.scoring_history()
"""
return self._parms.get("keep_cross_validation_predictions")
@keep_cross_validation_predictions.setter
def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
assert_is_type(keep_cross_validation_predictions, None, bool)
self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions
@property
def keep_cross_validation_fold_assignment(self):
"""
Whether to keep the cross-validation fold assignment.
Type: ``bool`` (default: ``False``).
:examples:
>>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
>>> predictors = ["radiation","temperature","wind"]
>>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
>>> ozone_km = H2OKMeansEstimator(keep_cross_validation_fold_assignment=True,
... nfolds=5,
... seed=1234)
>>> ozone_km.train(x=predictors,
... training_frame=train)
>>> ozone_km.scoring_history()
"""
return self._parms.get("keep_cross_validation_fold_assignment")
@keep_cross_validation_fold_assignment.setter
def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
assert_is_type(keep_cross_validation_fold_assignment, None, bool)
self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment
@property
def fold_assignment(self):
"""
Cross-validation fold assignment scheme, used if fold_column is not specified. The 'Stratified' option will
stratify the folds based on the response variable for classification problems.
One of: ``"auto"``, ``"random"``, ``"modulo"``, ``"stratified"`` (default: ``"auto"``).
:examples:
>>> ozone = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/ozone.csv")
>>> predictors = ["radiation","temperature","wind"]
>>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
>>> ozone_km = H2OKMeansEstimator(fold_assignment="Random",
... nfolds=5,
... seed=1234)
>>> ozone_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> ozone_km.scoring_history()
"""
return self._parms.get("fold_assignment")
@fold_assignment.setter
def fold_assignment(self, fold_assignment):
assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified"))
self._parms["fold_assignment"] = fold_assignment
@property
def fold_column(self):
"""
Column with cross-validation fold index assignment per observation.
Type: ``str``.
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
>>> fold_numbers.set_names(["fold_numbers"])
>>> cars = cars.cbind(fold_numbers)
>>> print(cars['fold_numbers'])
>>> cars_km = H2OKMeansEstimator(seed=1234)
>>> cars_km.train(x=predictors,
... training_frame=cars,
... fold_column="fold_numbers")
>>> cars_km.scoring_history()
"""
return self._parms.get("fold_column")
@fold_column.setter
def fold_column(self, fold_column):
assert_is_type(fold_column, None, str)
self._parms["fold_column"] = fold_column
@property
def ignored_columns(self):
"""
Names of columns to ignore for training.
Type: ``List[str]``.
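A minimal sketch (the ``ID`` column name is assumed to be present in the prostate
dataset used by the other examples here):
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_km = H2OKMeansEstimator(ignored_columns=["ID"], seed=1234)
>>> pros_km.train(training_frame=train, validation_frame=valid)
>>> pros_km.scoring_history()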
"""
return self._parms.get("ignored_columns")
@ignored_columns.setter
def ignored_columns(self, ignored_columns):
assert_is_type(ignored_columns, None, [str])
self._parms["ignored_columns"] = ignored_columns
@property
def ignore_const_cols(self):
"""
Ignore constant columns.
Type: ``bool`` (default: ``True``).
:examples:
>>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
>>> predictors = ["displacement","power","weight","acceleration","year"]
>>> cars["const_1"] = 6
>>> cars["const_2"] = 7
>>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
>>> cars_km = H2OKMeansEstimator(ignore_const_cols=True,
... seed=1234)
>>> cars_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> cars_km.scoring_history()
"""
return self._parms.get("ignore_const_cols")
@ignore_const_cols.setter
def ignore_const_cols(self, ignore_const_cols):
assert_is_type(ignore_const_cols, None, bool)
self._parms["ignore_const_cols"] = ignore_const_cols
@property
def score_each_iteration(self):
"""
Whether to score during each iteration of model training.
Type: ``bool`` (default: ``False``).
:examples:
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
... "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(score_each_iteration=True,
... seed=1234)
>>> benign_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> benign_km.scoring_history()
"""
return self._parms.get("score_each_iteration")
@score_each_iteration.setter
def score_each_iteration(self, score_each_iteration):
assert_is_type(score_each_iteration, None, bool)
self._parms["score_each_iteration"] = score_each_iteration
@property
def k(self):
"""
The maximum number of clusters. If estimate_k is disabled, the model will find exactly k centroids; otherwise it
will find up to k centroids.
Type: ``int`` (default: ``1``).
:examples:
>>> seeds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/seeds_dataset.txt")
>>> predictors = seeds.columns[0:7]
>>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
>>> seeds_km = H2OKMeansEstimator(k=3, seed=1234)
>>> seeds_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> seeds_km.scoring_history()
"""
return self._parms.get("k")
@k.setter
def k(self, k):
assert_is_type(k, None, int)
self._parms["k"] = k
@property
def estimate_k(self):
"""
Whether to estimate the number of clusters (<=k) iteratively and deterministically.
Type: ``bool`` (default: ``False``).
:examples:
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris['class'] = iris['class'].asfactor()
>>> predictors = iris.columns[:-1]
>>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
>>> iris_kmeans = H2OKMeansEstimator(k=10,
... estimate_k=True,
... standardize=False,
... seed=1234)
>>> iris_kmeans.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> iris_kmeans.scoring_history()
"""
return self._parms.get("estimate_k")
@estimate_k.setter
def estimate_k(self, estimate_k):
assert_is_type(estimate_k, None, bool)
self._parms["estimate_k"] = estimate_k
@property
def user_points(self):
"""
This option allows you to specify a dataframe where each row represents an initial cluster center. The
user-specified points must have the same number of columns as the training observations. The number of rows must
equal the number of clusters.
Type: ``H2OFrame``.
:examples:
>>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
>>> iris['class'] = iris['class'].asfactor()
>>> predictors = iris.columns[:-1]
>>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
>>> point1 = [4.9,3.0,1.4,0.2]
>>> point2 = [5.6,2.5,3.9,1.1]
>>> point3 = [6.5,3.0,5.2,2.0]
>>> points = h2o.H2OFrame([point1, point2, point3])
>>> iris_km = H2OKMeansEstimator(k=3,
... user_points=points,
... seed=1234)
>>> iris_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> iris_km.tot_withinss(valid=True)
"""
return self._parms.get("user_points")
@user_points.setter
def user_points(self, user_points):
self._parms["user_points"] = H2OFrame._validate(user_points, 'user_points')
@property
def max_iterations(self):
"""
Maximum training iterations (if estimate_k is enabled, then this is for each inner Lloyd's iteration).
Type: ``int`` (default: ``10``).
:examples:
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
... "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(max_iterations=50)
>>> benign_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> benign_km.scoring_history()
"""
return self._parms.get("max_iterations")
@max_iterations.setter
def max_iterations(self, max_iterations):
assert_is_type(max_iterations, None, int)
self._parms["max_iterations"] = max_iterations
@property
def standardize(self):
"""
Standardize columns before computing distances.
Type: ``bool`` (default: ``True``).
:examples:
>>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
>>> predictors = boston.columns[:-1]
>>> boston['chas'] = boston['chas'].asfactor()
>>> train, valid = boston.split_frame(ratios=[.8])
>>> boston_km = H2OKMeansEstimator(standardize=True)
>>> boston_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> boston_km.scoring_history()
"""
return self._parms.get("standardize")
@standardize.setter
def standardize(self, standardize):
assert_is_type(standardize, None, bool)
self._parms["standardize"] = standardize
@property
def seed(self):
"""
Seed for the random number generator (RNG).
Type: ``int`` (default: ``-1``).
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> pros_w_seed = H2OKMeansEstimator(seed=1234)
>>> pros_w_seed.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_wo_seed = H2OKMeansEstimator()
>>> pros_wo_seed.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_w_seed.scoring_history()
>>> pros_wo_seed.scoring_history()
"""
return self._parms.get("seed")
@seed.setter
def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed
@property
def init(self):
"""
Initialization mode for the cluster centroids.
One of: ``"random"``, ``"plus_plus"``, ``"furthest"``, ``"user"`` (default: ``"furthest"``).
:examples:
>>> seeds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/seeds_dataset.txt")
>>> predictors = seeds.columns[0:7]
>>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
>>> seeds_km = H2OKMeansEstimator(k=3,
... init='Furthest',
... seed=1234)
>>> seeds_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> seeds_km.scoring_history()
"""
return self._parms.get("init")
@init.setter
def init(self, init):
assert_is_type(init, None, Enum("random", "plus_plus", "furthest", "user"))
self._parms["init"] = init
@property
def max_runtime_secs(self):
"""
Maximum allowed runtime in seconds for model training. Use 0 to disable.
Type: ``float`` (default: ``0``).
:examples:
>>> benign = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/benign.csv")
>>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
... "AGP1","AGMN","LIV","AGLP"]
>>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
>>> benign_km = H2OKMeansEstimator(max_runtime_secs=10,
... seed=1234)
>>> benign_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> benign_km.scoring_history()
"""
return self._parms.get("max_runtime_secs")
@max_runtime_secs.setter
def max_runtime_secs(self, max_runtime_secs):
assert_is_type(max_runtime_secs, None, numeric)
self._parms["max_runtime_secs"] = max_runtime_secs
@property
def categorical_encoding(self):
"""
Encoding scheme for categorical features.
One of: ``"auto"``, ``"enum"``, ``"one_hot_internal"``, ``"one_hot_explicit"``, ``"binary"``, ``"eigen"``,
``"label_encoder"``, ``"sort_by_response"``, ``"enum_limited"`` (default: ``"auto"``).
:examples:
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
>>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
>>> encoding = "one_hot_explicit"
>>> pros_km = H2OKMeansEstimator(categorical_encoding=encoding,
... seed=1234)
>>> pros_km.train(x=predictors,
... training_frame=train,
... validation_frame=valid)
>>> pros_km.scoring_history()
"""
return self._parms.get("categorical_encoding")
@categorical_encoding.setter
def categorical_encoding(self, categorical_encoding):
assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
self._parms["categorical_encoding"] = categorical_encoding
@property
def export_checkpoints_dir(self):
"""
Automatically export generated models to this directory.
Type: ``str``.
:examples:
>>> import tempfile
>>> from os import listdir
>>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex")
>>> predictors = ["DayofMonth", "DayOfWeek"]
>>> checkpoints_dir = tempfile.mkdtemp()
>>> air_km = H2OKMeansEstimator(export_checkpoints_dir=checkpoints_dir,
... seed=1234)
>>> air_km.train(x=predictors, training_frame=airlines)
>>> len(listdir(checkpoints_dir))
"""
return self._parms.get("export_checkpoints_dir")
@export_checkpoints_dir.setter
def export_checkpoints_dir(self, export_checkpoints_dir):
assert_is_type(export_checkpoints_dir, None, str)
self._parms["export_checkpoints_dir"] = export_checkpoints_dir
@property
def cluster_size_constraints(self):
"""
An array specifying the minimum number of points that should be in each cluster. The length of the constraints
array has to be the same as the number of clusters.
Type: ``List[int]``.
:examples:
>>> iris_h2o = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv")
>>> k=3
>>> start_points = h2o.H2OFrame(
... [[4.9, 3.0, 1.4, 0.2],
... [5.6, 2.5, 3.9, 1.1],
... [6.5, 3.0, 5.2, 2.0]])
>>> kmm = H2OKMeansEstimator(k=k,
... user_points=start_points,
... standardize=True,
... cluster_size_constraints=[2, 5, 8],
... score_each_iteration=True)
>>> kmm.train(x=list(range(4)), training_frame=iris_h2o)
>>> kmm.scoring_history()
"""
return self._parms.get("cluster_size_constraints")
@cluster_size_constraints.setter
def cluster_size_constraints(self, cluster_size_constraints):
assert_is_type(cluster_size_constraints, None, [int])
self._parms["cluster_size_constraints"] = cluster_size_constraints