library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames, colnames<-, ifelse, is.character,
##     is.factor, is.numeric, log, log10, log1p, log2, round, signif, trunc
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 hours 43 minutes 
##     H2O cluster timezone:       Europe/Prague 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.33.1.99999 
##     H2O cluster version age:    3 hours and 22 minutes  
##     H2O cluster name:           H2O_from_python_tomasfryda_s3cfwf 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.50 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.1.0 (2021-05-18)
h2o.no_progress()
df <- h2o.importFile("https://h2o-public-test-data.s3.amazonaws.com/smalldata/wine/winequality-redwhite-no-BOM.csv")

response <- "quality"

predictors <- c(
  "fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide",
  "total sulfur dioxide", "density", "pH", "sulphates", "alcohol",  "type"
)


df_splits <- h2o.splitFrame(df, seed = 1)
train <- df_splits[[1]]
test <- df_splits[[2]]
aml <- h2o.automl(predictors, response, train, max_runtime_secs = 120)
h2o.explain(aml, test)

Leaderboard

The leaderboard shows the models with their metrics. When provided with an H2OAutoML object, the leaderboard shows 5-fold cross-validated metrics by default (depending on the H2OAutoML settings); otherwise it shows metrics computed on the newdata frame. At most 20 models are shown by default.

| | model_id | mean_residual_deviance | rmse | mse | mae | rmsle | training_time_ms | predict_time_per_row_ms | algo |
|---:|:---|---:|---:|---:|---:|---:|---:|---:|:---|
| 1 | StackedEnsemble_AllModels_AutoML_17_20210827_140233 | 0.3785 | 0.6152 | 0.3785 | 0.4331 | 0.0933 | 816 | 0.033133 | StackedEnsemble |
| 2 | StackedEnsemble_BestOfFamily_AutoML_17_20210827_140233 | 0.3834 | 0.6192 | 0.3834 | 0.4406 | 0.0939 | 375 | 0.013563 | StackedEnsemble |
| 3 | DRF_1_AutoML_17_20210827_140233 | 0.3896 | 0.6242 | 0.3896 | 0.4511 | 0.0948 | 581 | 0.003999 | DRF |
| 4 | XRT_1_AutoML_17_20210827_140233 | 0.3900 | 0.6245 | 0.3900 | 0.4512 | 0.0949 | 591 | 0.003586 | DRF |
| 5 | GBM_grid__1_AutoML_17_20210827_140233_model_8 | 0.3976 | 0.6306 | 0.3976 | 0.4575 | 0.0956 | 476 | 0.0037 | GBM |
| 6 | GBM_grid__1_AutoML_17_20210827_140233_model_9 | 0.3996 | 0.6322 | 0.3996 | 0.4327 | 0.0958 | 803 | 0.003731 | GBM |
| 7 | GBM_grid__1_AutoML_17_20210827_140233_model_4 | 0.4015 | 0.6337 | 0.4015 | 0.4316 | 0.0961 | 1099 | 0.003499 | GBM |
| 8 | GBM_grid__1_AutoML_17_20210827_140233_model_10 | 0.4024 | 0.6343 | 0.4024 | 0.4418 | 0.0962 | 837 | 0.003541 | GBM |
| 9 | XGBoost_grid__1_AutoML_17_20210827_140233_model_30 | 0.4079 | 0.6387 | 0.4079 | 0.4589 | 0.0965 | 904 | 0.001782 | XGBoost |
| 10 | GBM_grid__1_AutoML_17_20210827_140233_model_15 | 0.4083 | 0.6390 | 0.4083 | 0.4623 | 0.0968 | 582 | 0.003866 | GBM |
| 11 | XGBoost_grid__1_AutoML_17_20210827_140233_model_8 | 0.4093 | 0.6397 | 0.4093 | 0.4440 | 0.0969 | 977 | 0.001291 | XGBoost |
| 12 | XGBoost_grid__1_AutoML_17_20210827_140233_model_7 | 0.4093 | 0.6398 | 0.4093 | 0.4320 | 0.0967 | 1214 | 0.0018 | XGBoost |
| 13 | GBM_4_AutoML_17_20210827_140233 | 0.4129 | 0.6426 | 0.4129 | 0.4815 | 0.0972 | 291 | 0.003488 | GBM |
| 14 | GBM_grid__1_AutoML_17_20210827_140233_model_6 | 0.4139 | 0.6433 | 0.4139 | 0.4764 | 0.0973 | 466 | 0.0043 | GBM |
| 15 | GBM_grid__1_AutoML_17_20210827_140233_model_17 | 0.4153 | 0.6445 | 0.4153 | 0.4789 | 0.0975 | 376 | 0.004507 | GBM |
| 16 | XGBoost_grid__1_AutoML_17_20210827_140233_model_17 | 0.4157 | 0.6448 | 0.4157 | 0.4728 | 0.0972 | 644 | 0.00126 | XGBoost |
| 17 | XGBoost_grid__1_AutoML_17_20210827_140233_model_10 | 0.4220 | 0.6496 | 0.4220 | 0.4781 | 0.0981 | 1761 | 0.001728 | XGBoost |
| 18 | XGBoost_grid__1_AutoML_17_20210827_140233_model_12 | 0.4235 | 0.6508 | 0.4235 | 0.4488 | 0.0983 | 577 | 0.001128 | XGBoost |
| 19 | GBM_grid__1_AutoML_17_20210827_140233_model_7 | 0.4243 | 0.6514 | 0.4243 | 0.4856 | 0.0985 | 343 | 0.004725 | GBM |
| 20 | XGBoost_grid__1_AutoML_17_20210827_140233_model_22 | 0.4244 | 0.6515 | 0.4244 | 0.4488 | 0.0985 | 992 | 0.001443 | XGBoost |
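The leaderboard can also be retrieved as an H2OFrame for further inspection. A minimal sketch using h2o.get_leaderboard; the extra_columns argument adds the timing columns shown above:

# Leaderboard with the optional timing columns
lb <- h2o.get_leaderboard(aml, extra_columns = "ALL")
head(as.data.frame(lb), 20)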

Residual Analysis

Residual Analysis plots the fitted values versus the residuals on a test dataset. Ideally, residuals should be randomly distributed. Patterns in this plot can indicate potential problems with the model selection, e.g., using a simpler model than necessary, not accounting for heteroscedasticity, autocorrelation, etc. Note that if you see "striped" lines of residuals, that is an artifact of having an integer-valued (as opposed to real-valued) response variable.
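
Each section of the explanation can also be produced on its own. A minimal sketch for the residual analysis of a single model, here the AutoML leader accessed via aml@leader:

# Residual analysis of the leader model on the test frame
h2o.residual_analysis_plot(aml@leader, test)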

Variable Importance

The variable importance plot shows the relative importance of the most important variables in the model.
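
A sketch for drawing this plot for a single model directly. Stacked ensembles do not report variable importance, so a base model is used; h2o.get_best_model is available in recent h2o releases:

# Variable importance of the best GBM from the run
gbm <- h2o.get_best_model(aml, algorithm = "gbm")
h2o.varimp_plot(gbm, num_of_features = 10)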

Variable Importance Heatmap

The variable importance heatmap shows variable importance across multiple models. Some models in H2O return variable importance for one-hot (binary indicator) encoded versions of categorical columns (e.g., Deep Learning, XGBoost). So that the variable importance of categorical columns can be compared across all model types, we summarize the variable importance across all one-hot encoded features and return a single variable importance for the original categorical feature. By default, the models and variables are ordered by their similarity.
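
The heatmap can be drawn directly from the AutoML object; a minimal sketch:

# Variable importance across the leaderboard models
h2o.varimp_heatmap(aml)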

Model Correlation

This plot shows the correlation between the predictions of the models. For classification, the frequency of identical predictions is used. By default, models are ordered by their similarity (as computed by hierarchical clustering).
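
A minimal sketch for drawing this heatmap on its own:

# Correlation of model predictions, computed on the test frame
h2o.model_correlation_heatmap(aml, test)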

Interpretable models: GLM_1_AutoML_17_20210827_140233

SHAP Summary

The SHAP summary plot shows the contribution of the features for each instance (row of data). The sum of the feature contributions and the bias term equals the raw prediction of the model, i.e., the prediction before applying the inverse link function.
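
SHAP-based plots require a model that can compute prediction contributions (tree-based algorithms such as GBM, DRF, and XGBoost), so this sketch uses the best GBM from the run rather than the stacked ensemble:

# SHAP summary of a tree-based model on the test frame
gbm <- h2o.get_best_model(aml, algorithm = "gbm")
h2o.shap_summary_plot(gbm, test)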

Partial Dependence Plots

A partial dependence plot (PDP) gives a graphical depiction of the marginal effect of a variable on the response. The effect of a variable is measured as the change in the mean response. The PDP assumes independence between the feature for which the PDP is computed and the rest of the features.
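
A sketch of a standalone multi-model PDP; the column "alcohol" (one of the predictors above) is chosen for illustration:

# Partial dependence of quality on alcohol, one curve per model
h2o.pd_multi_plot(aml, test, column = "alcohol")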

Individual Conditional Expectations

An Individual Conditional Expectation (ICE) plot gives a graphical depiction of the marginal effect of a variable on the response. ICE plots are similar to partial dependence plots (PDPs); a PDP shows the average effect of a feature, while an ICE plot shows the effect for a single instance. This function plots the effect for each decile. In contrast to the PDP, ICE plots can provide more insight, especially when there are strong feature interactions.
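
A sketch of a standalone ICE plot for a single model, again using the illustrative column "alcohol":

# ICE curves (one per decile) for the leader model
h2o.ice_plot(aml@leader, test, column = "alcohol")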

h2o.explain_row(aml, test, row_index = 42)
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames

Leaderboard

The leaderboard shows the models with their metrics and their predictions for a given row. When provided with an H2OAutoML object, the leaderboard shows 5-fold cross-validated metrics by default (depending on the H2OAutoML settings); otherwise it shows metrics computed on the newdata frame. At most 20 models are shown by default.

| | model_id | mean_residual_deviance | rmse | mse | mae | rmsle | training_time_ms | predict_time_per_row_ms | algo | predict |
|---:|:---|---:|---:|---:|---:|---:|---:|---:|:---|---:|
| 1 | StackedEnsemble_AllModels_AutoML_17_20210827_140233 | 0.3785 | 0.6152 | 0.3785 | 0.4331 | 0.0933 | 816 | 0.033133 | StackedEnsemble | 4.4946 |
| 2 | StackedEnsemble_BestOfFamily_AutoML_17_20210827_140233 | 0.3834 | 0.6192 | 0.3834 | 0.4406 | 0.0939 | 375 | 0.013563 | StackedEnsemble | 4.5754 |
| 3 | DRF_1_AutoML_17_20210827_140233 | 0.3896 | 0.6242 | 0.3896 | 0.4511 | 0.0948 | 581 | 0.003999 | DRF | 4.6988 |
| 4 | XRT_1_AutoML_17_20210827_140233 | 0.3900 | 0.6245 | 0.3900 | 0.4512 | 0.0949 | 591 | 0.003586 | DRF | 4.7247 |
| 5 | GBM_grid__1_AutoML_17_20210827_140233_model_8 | 0.3976 | 0.6306 | 0.3976 | 0.4575 | 0.0956 | 476 | 0.0037 | GBM | 4.7263 |
| 6 | GBM_grid__1_AutoML_17_20210827_140233_model_9 | 0.3996 | 0.6322 | 0.3996 | 0.4327 | 0.0958 | 803 | 0.003731 | GBM | 4.1916 |
| 7 | GBM_grid__1_AutoML_17_20210827_140233_model_4 | 0.4015 | 0.6337 | 0.4015 | 0.4316 | 0.0961 | 1099 | 0.003499 | GBM | 4.4819 |
| 8 | GBM_grid__1_AutoML_17_20210827_140233_model_10 | 0.4024 | 0.6343 | 0.4024 | 0.4418 | 0.0962 | 837 | 0.003541 | GBM | 4.6800 |
| 9 | XGBoost_grid__1_AutoML_17_20210827_140233_model_30 | 0.4079 | 0.6387 | 0.4079 | 0.4589 | 0.0965 | 904 | 0.001782 | XGBoost | 4.7506 |
| 10 | GBM_grid__1_AutoML_17_20210827_140233_model_15 | 0.4083 | 0.6390 | 0.4083 | 0.4623 | 0.0968 | 582 | 0.003866 | GBM | 4.5550 |
| 11 | XGBoost_grid__1_AutoML_17_20210827_140233_model_8 | 0.4093 | 0.6397 | 0.4093 | 0.4440 | 0.0969 | 977 | 0.001291 | XGBoost | 4.4626 |
| 12 | XGBoost_grid__1_AutoML_17_20210827_140233_model_7 | 0.4093 | 0.6398 | 0.4093 | 0.4320 | 0.0967 | 1214 | 0.0018 | XGBoost | 4.4402 |
| 13 | GBM_4_AutoML_17_20210827_140233 | 0.4129 | 0.6426 | 0.4129 | 0.4815 | 0.0972 | 291 | 0.003488 | GBM | 5.0650 |
| 14 | GBM_grid__1_AutoML_17_20210827_140233_model_6 | 0.4139 | 0.6433 | 0.4139 | 0.4764 | 0.0973 | 466 | 0.0043 | GBM | 4.7966 |
| 15 | GBM_grid__1_AutoML_17_20210827_140233_model_17 | 0.4153 | 0.6445 | 0.4153 | 0.4789 | 0.0975 | 376 | 0.004507 | GBM | 4.8431 |
| 16 | XGBoost_grid__1_AutoML_17_20210827_140233_model_17 | 0.4157 | 0.6448 | 0.4157 | 0.4728 | 0.0972 | 644 | 0.00126 | XGBoost | 4.7958 |
| 17 | XGBoost_grid__1_AutoML_17_20210827_140233_model_10 | 0.4220 | 0.6496 | 0.4220 | 0.4781 | 0.0981 | 1761 | 0.001728 | XGBoost | 5.1110 |
| 18 | XGBoost_grid__1_AutoML_17_20210827_140233_model_12 | 0.4235 | 0.6508 | 0.4235 | 0.4488 | 0.0983 | 577 | 0.001128 | XGBoost | 4.4546 |
| 19 | GBM_grid__1_AutoML_17_20210827_140233_model_7 | 0.4243 | 0.6514 | 0.4243 | 0.4856 | 0.0985 | 343 | 0.004725 | GBM | 5.0027 |
| 20 | XGBoost_grid__1_AutoML_17_20210827_140233_model_22 | 0.4244 | 0.6515 | 0.4244 | 0.4488 | 0.0985 | 992 | 0.001443 | XGBoost | 4.6255 |
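The predict column holds each model's prediction for the selected row; for a single model it can be reproduced directly, e.g. for the leader (a sketch):

# Prediction of the leader model for row 42 of the test frame
h2o.predict(aml@leader, test[42, ])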

SHAP explanation

The SHAP explanation shows the contribution of the features for a given instance. The sum of the feature contributions and the bias term equals the raw prediction of the model, i.e., the prediction before applying the inverse link function. H2O implements TreeSHAP which, when the features are correlated, can increase the contribution of a feature that had no influence on the prediction.
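
A sketch of the corresponding standalone plot, again using a tree-based model since the contributions are computed via TreeSHAP:

# SHAP contributions for row 42 of the test frame
gbm <- h2o.get_best_model(aml, algorithm = "gbm")
h2o.shap_explain_row_plot(gbm, test, row_index = 42)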