transform

  • Available in: GLRM, PCA, Aggregator

  • Hyperparameter: yes

Description

Use the transform parameter to specify the transformation method used for numeric columns in the training data. Available options include:

  • None: Do not perform any transformations on the data.

  • Standardize: Standardizing subtracts the mean and then divides each variable by its standard deviation.

  • Normalize: Scales all numeric variables in the range [0,1].

  • Demean: The mean for each variable is subtracting from each observation resulting in mean zero. Note that it is not always advisable to demean the data if the Moving Average parameter is of primary interest to estimate.

  • Descale: Divides by the standard deviation of each column.

In PCA and GLRM, this value defaults to None. In Aggregator, this value defaults to Normalize.

Example

library(h2o)
h2o.init()

# Load the Birds dataset
birds.hex <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv")

# Train using Standardized transform
birds.pca <- h2o.prcomp(training_frame = birds.hex, transform = "STANDARDIZE",
                        k = 3, pca_method="Power", use_all_factor_levels=TRUE,
                        impute_missing=TRUE)

# View the importance of components
birds.pca@model$importance
Importance of components:
                            pc1      pc2      pc3
Standard deviation     1.496991 1.351000 1.014182
Proportion of Variance 0.289987 0.236184 0.133098
Cumulative Proportion  0.289987 0.526171 0.659269

# View the eigenvectors
birds.pca@model$eigenvectors
Rotation:
                  pc1      pc2       pc3
patch.Ref1a  0.007207 0.007449  0.001161
patch.Ref1b -0.003090 0.011257 -0.001066
patch.Ref1c  0.002962 0.008850 -0.000264
patch.Ref1d -0.001295 0.011003  0.000501
patch.Ref1e  0.006559 0.006904 -0.001206

---
                pc1       pc2       pc3
S          0.463591 -0.053410  0.184799
year      -0.055934  0.009691 -0.968635
area       0.533375 -0.289381 -0.130338
log.area.  0.583966 -0.262287 -0.089582
ENN       -0.270615 -0.573900  0.038835
log.ENN.  -0.231368 -0.640231  0.026325

# Train again using Normalize transform
birds2.pca <- h2o.prcomp(training_frame = birds.hex, transform = "NORMALIZE",
                         k = 3, pca_method="Power", use_all_factor_levels=TRUE,
                         impute_missing=TRUE)

# View the importance of components
birds2.pca@model$importance
Importance of components:
                            pc1      pc2      pc3
Standard deviation     0.632015 0.531616 0.517096
Proportion of Variance 0.166444 0.117764 0.111418
Cumulative Proportion  0.166444 0.284208 0.395626

# View the eigenvectors
birds2.pca@model$eigenvectors
Rotation:
                pc1       pc2       pc3
patch.Ref1a 0.026631 -0.006839 0.008674
patch.Ref1b 0.025825 -0.010199 0.004386
patch.Ref1c 0.026240 -0.008322 0.006759
patch.Ref1d 0.026106 -0.009375 0.005472
patch.Ref1e 0.026313 -0.007510 0.007769

---
                pc1       pc2       pc3
S          0.055295  0.113531  0.141168
year      -0.003343 -0.013812 -0.019785
area      -0.011008  0.064146  0.087213
log.area.  0.007378  0.080143  0.086986
ENN       -0.151652 -0.026572 -0.013064
log.ENN.  -0.463210 -0.046953  0.086169
import(h2o)
h2o.init()
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

# Load the Birds dataset
birds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv")

# Train with the Power pca_method
birds.pca = H2OPrincipalComponentAnalysisEstimator(k = 3, transform = "STANDARDIZE", pca_method="Power",
                   use_all_factor_levels=True, impute_missing=True)
birds.pca.train(x=list(range(4)), training_frame=birds)

# View the importance of components
birds.pca.varimp(use_pandas=False)
[(u'Standard deviation', 1.0505993078459912, 0.8950182545325247, 0.5587566783073901),
(u'Proportion of Variance', 0.28699613488673914, 0.20828865401845226, 0.08117966990084355),
(u'Cumulative Proportion', 0.28699613488673914, 0.4952847889051914, 0.5764644588060349)]

# View the eigenvectors
birds.pca.rotation()
Rotation:
                   pc1                 pc2                pc3
-----------------  ------------------  -----------------  ----------------
patch.Ref1a        0.00732398141913    -0.0141576160836   0.0294419461081
patch.Ref1b        -0.00482860843905   0.00867426840498   0.0330778190153
patch.Ref1c        0.00124768649004    -0.00274167383932  0.0312598825617
patch.Ref1d        -0.000370181920761  0.000297923901103  0.0317439245635
patch.Ref1e        0.00223394447742    -0.00459462277502  0.0309648089406
---                ---                 ---                ---
landscape.Bauxite  -0.0638494513759    0.136728811833     0.118858152002
landscape.Forest   0.0378085502606     -0.0833578672691   0.969316569884
landscape.Urban    -0.0545759062856    0.111309410422     0.0354475756223
S                  0.564501605704      -0.767095710638    -0.0466832766991
year               -0.814596906726     -0.577331674836    -0.0101626722479

# See the whole table with table.as_data_frame()

# Train again using Normalize transform
birds2 = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv")
birds2.pca = H2OPrincipalComponentAnalysisEstimator(k = 3, transform = "NORMALIZE", pca_method="Power",
                    use_all_factor_levels=True, impute_missing=True)
birds2.pca.train(x=list(range(4)), training_frame=birds2)

# View the importance of components
birds2.pca.varimp(use_pandas=False)
[(u'Standard deviation', 0.5615959368803389, 0.527199563812311, 0.5094397597133178),
(u'Proportion of Variance', 0.14220176282406302, 0.12531618081504411, 0.11701532412044723),
(u'Cumulative Proportion', 0.14220176282406302, 0.26751794363910714, 0.3845332677595544)]

# View the eigenvectors
birds2.pca.rotation()
Rotation:
                   pc1                pc2                pc3
-----------------  -----------------  -----------------  -----------------
patch.Ref1a        0.0321402336467    -5.67047495074e-05  0.000466136314122
patch.Ref1b        0.0312293374798    -0.00233972080607   -0.00219708018283
patch.Ref1c        0.0316847855632    -0.00119821277779   -0.000865471934357
patch.Ref1d        0.0315635183971    -0.00150214960133   -0.00122002465866
patch.Ref1e        0.0317587104328    -0.00101293187492   -0.000649335409312
---                ---                ---                 ---
landscape.Bauxite  -0.0276965008223   -0.962683908867     0.166590998707
landscape.Forest   0.982163161865     -0.0373079859488    -0.0270202298116
landscape.Urban    -0.00873355942469  -0.0280626855484    -0.0394249459161
S                  0.0515403663478    0.113344870593      0.123141154399
year               -0.00488342003667  -0.0143717060558    -0.0187277019153

# See the whole table with table.as_data_frame()