Source code for h2o.estimators.pca

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


[docs]class H2OPrincipalComponentAnalysisEstimator(H2OEstimator):
    """
    Principal Components Analysis

    """

    algo = "pca"
    supervised_learning = False

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 score_each_iteration=False,  # type: bool
                 transform="none",  # type: Literal["none", "standardize", "normalize", "demean", "descale"]
                 pca_method="gram_s_v_d",  # type: Literal["gram_s_v_d", "power", "randomized", "glrm"]
                 pca_impl=None,  # type: Optional[Literal["mtj_evd_densematrix", "mtj_evd_symmmatrix", "mtj_svd_densematrix", "jama"]]
                 k=1,  # type: int
                 max_iterations=1000,  # type: int
                 use_all_factor_levels=False,  # type: bool
                 compute_metrics=True,  # type: bool
                 impute_missing=False,  # type: bool
                 seed=-1,  # type: int
                 max_runtime_secs=0.0,  # type: float
                 export_checkpoints_dir=None,  # type: Optional[str]
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns.
               Defaults to ``True``.
        :type ignore_const_cols: bool
        :param score_each_iteration: Whether to score during each iteration of model training.
               Defaults to ``False``.
        :type score_each_iteration: bool
        :param transform: Transformation of training data
               Defaults to ``"none"``.
        :type transform: Literal["none", "standardize", "normalize", "demean", "descale"]
        :param pca_method: Specify the algorithm to use for computing the principal components: GramSVD - uses a
               distributed computation of the Gram matrix, followed by a local SVD; Power - computes the SVD using the
               power iteration method (experimental); Randomized - uses randomized subspace iteration method; GLRM -
               fits a generalized low-rank model with L2 loss function and no regularization and solves for the SVD
               using local matrix algebra (experimental)
               Defaults to ``"gram_s_v_d"``.
        :type pca_method: Literal["gram_s_v_d", "power", "randomized", "glrm"]
        :param pca_impl: Specify the implementation to use for computing PCA (via SVD or EVD): MTJ_EVD_DENSEMATRIX -
               eigenvalue decompositions for dense matrix using MTJ; MTJ_EVD_SYMMMATRIX - eigenvalue decompositions for
               symmetric matrix using MTJ; MTJ_SVD_DENSEMATRIX - singular-value decompositions for dense matrix using
               MTJ; JAMA - eigenvalue decompositions for dense matrix using JAMA. References: JAMA -
               http://math.nist.gov/javanumerics/jama/; MTJ - https://github.com/fommil/matrix-toolkits-java/
               Defaults to ``None``.
        :type pca_impl: Literal["mtj_evd_densematrix", "mtj_evd_symmmatrix", "mtj_svd_densematrix", "jama"], optional
        :param k: Rank of matrix approximation
               Defaults to ``1``.
        :type k: int
        :param max_iterations: Maximum training iterations
               Defaults to ``1000``.
        :type max_iterations: int
        :param use_all_factor_levels: Whether first factor level is included in each categorical expansion
               Defaults to ``False``.
        :type use_all_factor_levels: bool
        :param compute_metrics: Whether to compute metrics on the training data
               Defaults to ``True``.
        :type compute_metrics: bool
        :param impute_missing: Whether to impute missing entries with the column mean
               Defaults to ``False``.
        :type impute_missing: bool
        :param seed: RNG seed for initialization
               Defaults to ``-1``.
        :type seed: int
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param export_checkpoints_dir: Automatically export generated models to this directory.
               Defaults to ``None``.
        :type export_checkpoints_dir: str, optional
        """
        super(H2OPrincipalComponentAnalysisEstimator, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.score_each_iteration = score_each_iteration
        self.transform = transform
        self.pca_method = pca_method
        self.pca_impl = pca_impl
        self.k = k
        self.max_iterations = max_iterations
        self.use_all_factor_levels = use_all_factor_levels
        self.compute_metrics = compute_metrics
        self.impute_missing = impute_missing
        self.seed = seed
        self.max_runtime_secs = max_runtime_secs
        self.export_checkpoints_dir = export_checkpoints_dir

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator()
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> train, valid = data.split_frame(ratios=[.8], seed=1234)
        >>> model_pca = H2OPrincipalComponentAnalysisEstimator(impute_missing=True)
        >>> model_pca.train(x=data.names,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> model_pca.show()
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def ignore_const_cols(self):
        """
        Ignore constant columns.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
        >>> prostate['RACE'] = prostate['RACE'].asfactor()
        >>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
        >>> prostate['DPROS'] = prostate['DPROS'].asfactor()
        >>> pros_pca = H2OPrincipalComponentAnalysisEstimator(ignore_const_cols=False)
        >>> pros_pca.train(x=prostate.names, training_frame=prostate)
        >>> pros_pca.show()
        """
        return self._parms.get("ignore_const_cols")

    @ignore_const_cols.setter
    def ignore_const_cols(self, ignore_const_cols):
        assert_is_type(ignore_const_cols, None, bool)
        self._parms["ignore_const_cols"] = ignore_const_cols

    @property
    def score_each_iteration(self):
        """
        Whether to score during each iteration of model training.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
        ...                                                   score_each_iteration=True,
        ...                                                   seed=1234,
        ...                                                   impute_missing=True)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("score_each_iteration")

    @score_each_iteration.setter
    def score_each_iteration(self, score_each_iteration):
        assert_is_type(score_each_iteration, None, bool)
        self._parms["score_each_iteration"] = score_each_iteration

    @property
    def transform(self):
        """
        Transformation of training data

        Type: ``Literal["none", "standardize", "normalize", "demean", "descale"]``, defaults to ``"none"``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
        ...                                                   transform="standardize",
        ...                                                   pca_method="power",
        ...                                                   impute_missing=True,
        ...                                                   max_iterations=800)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("transform")

    @transform.setter
    def transform(self, transform):
        assert_is_type(transform, None, Enum("none", "standardize", "normalize", "demean", "descale"))
        self._parms["transform"] = transform

    @property
    def pca_method(self):
        """
        Specify the algorithm to use for computing the principal components: GramSVD - uses a distributed computation of
        the Gram matrix, followed by a local SVD; Power - computes the SVD using the power iteration method
        (experimental); Randomized - uses randomized subspace iteration method; GLRM - fits a generalized low-rank model
        with L2 loss function and no regularization and solves for the SVD using local matrix algebra (experimental)

        Type: ``Literal["gram_s_v_d", "power", "randomized", "glrm"]``, defaults to ``"gram_s_v_d"``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
        ...                                                   transform="standardize",
        ...                                                   pca_method="power",
        ...                                                   impute_missing=True,
        ...                                                   max_iterations=800)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("pca_method")

    @pca_method.setter
    def pca_method(self, pca_method):
        assert_is_type(pca_method, None, Enum("gram_s_v_d", "power", "randomized", "glrm"))
        self._parms["pca_method"] = pca_method

    @property
    def pca_impl(self):
        """
        Specify the implementation to use for computing PCA (via SVD or EVD): MTJ_EVD_DENSEMATRIX - eigenvalue
        decompositions for dense matrix using MTJ; MTJ_EVD_SYMMMATRIX - eigenvalue decompositions for symmetric matrix
        using MTJ; MTJ_SVD_DENSEMATRIX - singular-value decompositions for dense matrix using MTJ; JAMA - eigenvalue
        decompositions for dense matrix using JAMA. References: JAMA - http://math.nist.gov/javanumerics/jama/; MTJ -
        https://github.com/fommil/matrix-toolkits-java/

        Type: ``Literal["mtj_evd_densematrix", "mtj_evd_symmmatrix", "mtj_svd_densematrix", "jama"]``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
        ...                                                   pca_impl="jama",
        ...                                                   impute_missing=True,
        ...                                                   max_iterations=1200)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("pca_impl")

    @pca_impl.setter
    def pca_impl(self, pca_impl):
        assert_is_type(pca_impl, None, Enum("mtj_evd_densematrix", "mtj_evd_symmmatrix", "mtj_svd_densematrix", "jama"))
        self._parms["pca_impl"] = pca_impl

    @property
    def k(self):
        """
        Rank of matrix approximation

        Type: ``int``, defaults to ``1``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
        ...                                                   transform="standardize",
        ...                                                   pca_method="power",
        ...                                                   impute_missing=True,
        ...                                                   max_iterations=800)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("k")

    @k.setter
    def k(self, k):
        assert_is_type(k, None, int)
        self._parms["k"] = k

    @property
    def max_iterations(self):
        """
        Maximum training iterations

        Type: ``int``, defaults to ``1000``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
        ...                                                   transform="standardize",
        ...                                                   pca_method="power",
        ...                                                   impute_missing=True,
        ...                                                   max_iterations=800)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("max_iterations")

    @max_iterations.setter
    def max_iterations(self, max_iterations):
        assert_is_type(max_iterations, None, int)
        self._parms["max_iterations"] = max_iterations

    @property
    def use_all_factor_levels(self):
        """
        Whether first factor level is included in each categorical expansion

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
        ...                                                   use_all_factor_levels=True,
        ...                                                   seed=1234)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("use_all_factor_levels")

    @use_all_factor_levels.setter
    def use_all_factor_levels(self, use_all_factor_levels):
        assert_is_type(use_all_factor_levels, None, bool)
        self._parms["use_all_factor_levels"] = use_all_factor_levels

    @property
    def compute_metrics(self):
        """
        Whether to compute metrics on the training data

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
        >>> prostate['RACE'] = prostate['RACE'].asfactor()
        >>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
        >>> prostate['DPROS'] = prostate['DPROS'].asfactor()
        >>> pros_pca = H2OPrincipalComponentAnalysisEstimator(compute_metrics=False)
        >>> pros_pca.train(x=prostate.names, training_frame=prostate)
        >>> pros_pca.show()
        """
        return self._parms.get("compute_metrics")

    @compute_metrics.setter
    def compute_metrics(self, compute_metrics):
        assert_is_type(compute_metrics, None, bool)
        self._parms["compute_metrics"] = compute_metrics

    @property
    def impute_missing(self):
        """
        Whether to impute missing entries with the column mean

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
        >>> prostate['RACE'] = prostate['RACE'].asfactor()
        >>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
        >>> prostate['DPROS'] = prostate['DPROS'].asfactor()
        >>> pros_pca = H2OPrincipalComponentAnalysisEstimator(impute_missing=True)
        >>> pros_pca.train(x=prostate.names, training_frame=prostate)
        >>> pros_pca.show()
        """
        return self._parms.get("impute_missing")

    @impute_missing.setter
    def impute_missing(self, impute_missing):
        assert_is_type(impute_missing, None, bool)
        self._parms["impute_missing"] = impute_missing

    @property
    def seed(self):
        """
        RNG seed for initialization

        Type: ``int``, defaults to ``-1``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=3,
        ...                                                   seed=1234,
        ...                                                   impute_missing=True)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.

        :examples:

        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/SDSS_quasar.txt.zip")
        >>> data_pca = H2OPrincipalComponentAnalysisEstimator(k=-1,
        ...                                                   transform="standardize",
        ...                                                   pca_method="power",
        ...                                                   impute_missing=True,
        ...                                                   max_iterations=800
        ...                                                   max_runtime_secs=15)
        >>> data_pca.train(x=data.names, training_frame=data)
        >>> data_pca.show()
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        assert_is_type(max_runtime_secs, None, numeric)
        self._parms["max_runtime_secs"] = max_runtime_secs

    @property
    def export_checkpoints_dir(self):
        """
        Automatically export generated models to this directory.

        Type: ``str``.

        :examples:

        >>> import tempfile
        >>> from os import listdir
        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip")
        >>> prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
        >>> prostate['RACE'] = prostate['RACE'].asfactor()
        >>> prostate['DCAPS'] = prostate['DCAPS'].asfactor()
        >>> prostate['DPROS'] = prostate['DPROS'].asfactor()
        >>> checkpoints_dir = tempfile.mkdtemp()
        >>> pros_pca = H2OPrincipalComponentAnalysisEstimator(impute_missing=True,
        ...                                                   export_checkpoints_dir=checkpoints_dir)
        >>> pros_pca.train(x=prostate.names, training_frame=prostate)
        >>> len(listdir(checkpoints_dir))
        """
        return self._parms.get("export_checkpoints_dir")

    @export_checkpoints_dir.setter
    def export_checkpoints_dir(self, export_checkpoints_dir):
        assert_is_type(export_checkpoints_dir, None, str)
        self._parms["export_checkpoints_dir"] = export_checkpoints_dir


[docs]    def init_for_pipeline(self):
        """
        Returns H2OPCA object which implements fit and transform method to be used in sklearn.Pipeline properly.
        All parameters defined in self.__params, should be input parameters in H2OPCA.__init__ method.

        :returns: H2OPCA object

        :examples:

        >>> from sklearn.pipeline import Pipeline
        >>> from h2o.transforms.preprocessing import H2OScaler
        >>> from h2o.estimators import H2ORandomForestEstimator
        >>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
        >>> pipe = Pipeline([("standardize", H2OScaler()),
        ...                  ("pca", H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
        ...                  ("rf", H2ORandomForestEstimator(seed=42,ntrees=5))])
        >>> pipe.fit(iris[:4], iris[4])
        """
        import inspect
        from h2o.transforms.decomposition import H2OPCA
        # check which parameters can be passed to H2OPCA init
        var_names = list(dict(inspect.getmembers(H2OPCA.__init__.__code__))['co_varnames'])
        parameters = {k: v for k, v in self._parms.items() if k in var_names}
        return H2OPCA(**parameters)