impute_missing
¶
Available in: PCA
Hyperparameter: no
Description¶
By default, rows that contain NA/missing values are removed before training, so the dataset actually used can contain fewer rows than the one supplied. If this is not the desired behavior, use the impute_missing
option to replace the missing entries in each column with that column's mean value.
This option defaults to False.
Example¶
# Load the h2o package and initialize H2O
library(h2o)
h2o.init()

# Load the Birds dataset
birds.hex <- h2o.importFile(
  path = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv"
)

# Train a 3-component PCA with impute_missing enabled, so missing entries
# are replaced by the column mean instead of dropping the affected rows
birds.pca <- h2o.prcomp(
  training_frame = birds.hex,
  transform = "STANDARDIZE",
  k = 3,
  pca_method = "Power",
  use_all_factor_levels = TRUE,
  impute_missing = TRUE
)

# View the importance of components
birds.pca@model$importance
Importance of components:
pc1 pc2 pc3
Standard deviation 1.496991 1.351000 1.014182
Proportion of Variance 0.289987 0.236184 0.133098
Cumulative Proportion 0.289987 0.526171 0.659269
# View the eigenvectors (printed as the "Rotation" table, one column per
# principal component)
birds.pca@model$eigenvectors
Rotation:
pc1 pc2 pc3
patch.Ref1a 0.007207 0.007449 0.001161
patch.Ref1b -0.003090 0.011257 -0.001066
patch.Ref1c 0.002962 0.008850 -0.000264
patch.Ref1d -0.001295 0.011003 0.000501
patch.Ref1e 0.006559 0.006904 -0.001206
---
pc1 pc2 pc3
S 0.463591 -0.053410 0.184799
year -0.055934 0.009691 -0.968635
area 0.533375 -0.289381 -0.130338
log.area. 0.583966 -0.262287 -0.089582
ENN -0.270615 -0.573900 0.038835
log.ENN. -0.231368 -0.640231 0.026325
# Train again without imputing missing values; this run emits the warning
# about rows with NA/missing values being removed
birds2.pca <- h2o.prcomp(
  training_frame = birds.hex,
  transform = "STANDARDIZE",
  k = 3,
  pca_method = "Power",
  use_all_factor_levels = TRUE,
  impute_missing = FALSE
)
Warning message:
In doTryCatch(return(expr), name, parentenv, handler) :
_train: Dataset used may contain fewer number of rows due to removal of rows
with NA/missing values. If this is not desirable, set impute_missing argument
in pca call to TRUE/True/true/... depending on the client language.
# View the importance of components (standard deviation, proportion of
# variance and cumulative proportion) for the model trained without imputation
birds2.pca@model$importance
Importance of components:
pc1 pc2 pc3
Standard deviation 1.546397 1.348276 1.055239
Proportion of Variance 0.300269 0.228258 0.139820
Cumulative Proportion 0.300269 0.528527 0.668347
# View the eigenvectors (printed as the "Rotation" table) for the model
# trained without imputation
birds2.pca@model$eigenvectors
Rotation:
pc1 pc2 pc3
patch.Ref1a 0.009848 -0.005947 -0.001061
patch.Ref1b -0.001628 -0.014739 -0.001007
patch.Ref1c 0.004994 -0.009486 -0.000523
patch.Ref1d 0.000117 -0.004400 -0.004917
patch.Ref1e 0.003627 -0.001467 -0.004268
---
pc1 pc2 pc3
S 0.515048 0.226915 -0.123136
year -0.066269 -0.069526 0.971250
area 0.414050 0.344332 0.149339
log.area. 0.497313 0.363609 0.131261
ENN -0.390235 0.545631 -0.007944
log.ENN. -0.345665 0.562834 -0.002092
# Python example
import h2o
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
h2o.init()

# Load the Birds dataset
birds = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv")

# Train with impute_missing enabled, so missing entries are replaced by the
# column mean instead of dropping the affected rows.
# NOTE: `birds.pca` stores the estimator as an attribute on the `birds` object;
# the name is kept because the following snippets refer to it.
birds.pca = H2OPrincipalComponentAnalysisEstimator(k=3, transform="STANDARDIZE", pca_method="Power",
                                                   use_all_factor_levels=True, impute_missing=True)
# x=list(range(4)) passes column indices 0-3 as the columns to train on
birds.pca.train(x=list(range(4)), training_frame=birds)

# View the importance of components
birds.pca.varimp(use_pandas=False)
[(u'Standard deviation', 1.0505993078459912, 0.8950182545325247, 0.5587566783073901),
(u'Proportion of Variance', 0.28699613488673914, 0.20828865401845226, 0.08117966990084355),
(u'Cumulative Proportion', 0.28699613488673914, 0.4952847889051914, 0.5764644588060349)]
# View the eigenvectors (printed as the "Rotation" table, one column per
# principal component)
birds.pca.rotation()
Rotation:
pc1 pc2 pc3
----------------- ------------------ ----------------- ----------------
patch.Ref1a 0.00732398141913 -0.0141576160836 0.0294419461081
patch.Ref1b -0.00482860843905 0.00867426840498 0.0330778190153
patch.Ref1c 0.00124768649004 -0.00274167383932 0.0312598825617
patch.Ref1d -0.000370181920761 0.000297923901103 0.0317439245635
patch.Ref1e 0.00223394447742 -0.00459462277502 0.0309648089406
--- --- --- ---
landscape.Bauxite -0.0638494513759 0.136728811833 0.118858152002
landscape.Forest 0.0378085502606 -0.0833578672691 0.969316569884
landscape.Urban -0.0545759062856 0.111309410422 0.0354475756223
S 0.564501605704 -0.767095710638 -0.0466832766991
year -0.814596906726 -0.577331674836 -0.0101626722479
See the whole table with table.as_data_frame()
# Train again without imputing missing values, using a fresh import of the
# same Birds dataset
birds2 = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv"
)
birds2.pca = H2OPrincipalComponentAnalysisEstimator(
    k=3,
    transform="STANDARDIZE",
    pca_method="Power",
    use_all_factor_levels=True,
    impute_missing=False,
)
birds2.pca.train(x=list(range(4)), training_frame=birds2)

# View the importance of components
birds2.pca.varimp(use_pandas=False)
[(u'Standard deviation', 1.1238486420242524, 0.949554306091356, 0.534896629598228),
(u'Proportion of Variance', 0.3080623966646966, 0.21991895069672512, 0.06978510918460899),
(u'Cumulative Proportion', 0.3080623966646966, 0.5279813473614217, 0.5977664565460307)]
# View the eigenvectors (printed as the "Rotation" table) for the model
# trained without imputation
birds2.pca.rotation()
Rotation:
pc1 pc2 pc3
----------------- ----------------- ----------------- -----------------
patch.Ref1a 0.00898674970716 0.0133755203176 0.0386887315027
patch.Ref1b -0.00583910665399 -0.00850852817775 0.0403921679996
patch.Ref1c 0.00157382152659 0.00243349606991 0.0395404497512
patch.Ref1d 0.00205431391489 -0.00464763108225 0.0130225730145
patch.Ref1e 0.00521157104675 9.98792622547e-07 0.0126676559841
--- --- --- ---
landscape.Bauxite -0.0927064158093 -0.0985077050027 0.312254932996
landscape.Forest 0.049803344754 0.0606680349608 0.928822693132
landscape.Urban -0.0671561320808 -0.108679950396 0.033639706807
S 0.661206203315 0.69412159594 -0.0166591571667
year -0.727793152951 0.684904477663 -0.00409291536614
See the whole table with table.as_data_frame()