in_training_checkpoints_tree_interval

  • Available in: GBM

  • Hyperparameter: no

Description

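The in_training_checkpoints_tree_interval option specifies how many trees are built between consecutive checkpoints of the model during training. It is used together with in_training_checkpoints_dir: a larger interval writes fewer checkpoints, which is useful when you would like to reduce the size of the directory specified by in_training_checkpoints_dir.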

Example

library(h2o)
h2o.init()

# import the prostate dataset:
prostate <- h2o.importFile("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")

# set the predictors, response, and categorical features:
prostate$RACE <- as.factor(prostate$RACE)
prostate$CAPSULE <- as.factor(prostate$CAPSULE)
predictors <- c("ID", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON")
response <- "CAPSULE"

# specify directory for training checkpoints:
checkpoints_dir <- "checkpoints-interval"

# train the model, checkpointing it into checkpoints_dir at an interval
# of 2 trees during the training process:
pros_gbm <- h2o.gbm(x = predictors,
                    y = response,
                    model_id = "gbm-model",
                    ntrees = 10,
                    seed = 1111,
                    training_frame = prostate,
                    in_training_checkpoints_dir = checkpoints_dir,
                    in_training_checkpoints_tree_interval = 2)

# retrieve the number of files in the exported checkpoints directory:
num_files <- length(list.files(checkpoints_dir))
num_files # 4
# import necessary modules:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import tempfile
from os import listdir, path

# start h2o:
h2o.init()

# import the prostate dataset:
prostate = h2o.import_file(path="http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")

# set the predictors, response, and categorical features:
prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
prostate["RACE"] = prostate["RACE"].asfactor()
predictors = ["ID", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
response = "CAPSULE"

# specify directory for training checkpoints (a fresh temporary directory):
checkpoints_dir = tempfile.mkdtemp()

# Build and train the model and provide checkpoints in training process,
# checkpointing at an interval of 2 trees:
pros_gbm = H2OGradientBoostingEstimator(model_id="gbm_model",
                                        ntrees=10,
                                        seed=1111,
                                        in_training_checkpoints_dir=checkpoints_dir,
                                        in_training_checkpoints_tree_interval=2)

pros_gbm.train(x=predictors, y=response, training_frame=prostate)

# list the exported checkpoints directory once, then report both the
# file names and their count from that same listing:
checkpoints = listdir(checkpoints_dir)
print(checkpoints)
num_files = len(checkpoints)
print(num_files) # 4