1 Chapter 15: Screening Many Models

What if we don’t know which model to use?

1.1 Workflow Sets

library(tidymodels)
tidymodels_prefer()

data(concrete, package = "modeldata")
glimpse(concrete)
## Rows: 1,030
## Columns: 9
## $ cement               <dbl> 540.0, 540.0, 332.5, 332.5, 198.6, 266.0, 380.0, …
## $ blast_furnace_slag   <dbl> 0.0, 0.0, 142.5, 142.5, 132.4, 114.0, 95.0, 95.0,…
## $ fly_ash              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ water                <dbl> 162, 162, 228, 228, 192, 228, 228, 228, 228, 228,…
## $ superplasticizer     <dbl> 2.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
## $ coarse_aggregate     <dbl> 1040.0, 1055.0, 932.0, 932.0, 978.4, 932.0, 932.0…
## $ fine_aggregate       <dbl> 676.0, 676.0, 594.0, 594.0, 825.5, 670.0, 594.0, …
## $ age                  <int> 28, 28, 270, 365, 360, 90, 365, 28, 28, 28, 90, 2…
## $ compressive_strength <dbl> 79.99, 61.89, 40.27, 41.05, 44.30, 47.03, 43.70, …
# Some mixtures were tested multiple times; average the replicated outcomes so
# that the same mixture cannot end up in both the training and test sets
concrete <- 
   concrete %>% 
   group_by(across(-compressive_strength)) %>% 
   summarize(compressive_strength = mean(compressive_strength),
             .groups = "drop")
nrow(concrete)
## [1] 992
set.seed(1501)
concrete_split <- initial_split(concrete, strata = compressive_strength)
concrete_train <- training(concrete_split)
concrete_test  <- testing(concrete_split)

set.seed(1502)
# A small resampling object (3 folds, no repeats) to keep the initial screening fast
concrete_folds_mini <- 
   vfold_cv(concrete_train, strata = compressive_strength, repeats = 1, v = 3)

normalized_rec <- 
   recipe(compressive_strength ~ ., data = concrete_train) %>% 
   step_normalize(all_predictors()) 

poly_recipe <- 
   normalized_rec %>% 
   step_poly(all_predictors()) %>% 
   step_interact(~ all_predictors():all_predictors())
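
As a quick check (not part of the original analysis), prepping and baking poly_recipe shows how many columns the quadratic and interaction expansion produces:

poly_recipe %>% 
   prep() %>% 
   bake(new_data = NULL) %>%   # processed training set
   ncol()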

Some models (notably neural networks, K-nearest neighbors, and support vector machines) require predictors that have been centered and scaled, so some model workflows need a recipe with these preprocessing steps. For other models, a traditional response surface design expansion (i.e., quadratic terms and two-way interactions) is a good idea. The two recipes above cover both cases. Next, the model specifications are created; the rules and baguette packages provide the cubist_rules() and bag_tree() models:

library(rules)
## 
## Attaching package: 'rules'
## The following object is masked from 'package:dials':
## 
##     max_rules
library(baguette)

linear_reg_spec <- 
   linear_reg(penalty = tune(), mixture = tune()) %>% 
   set_engine("glmnet")

nnet_spec <- 
   mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>% 
   set_engine("nnet", MaxNWts = 2600) %>% 
   set_mode("regression")

mars_spec <- 
   mars(prod_degree = tune()) %>%  #<- use GCV to choose terms
   set_engine("earth") %>% 
   set_mode("regression")

svm_r_spec <- 
   svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
   set_engine("kernlab") %>% 
   set_mode("regression")

svm_p_spec <- 
   svm_poly(cost = tune(), degree = tune()) %>% 
   set_engine("kernlab") %>% 
   set_mode("regression")

knn_spec <- 
   nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune()) %>% 
   set_engine("kknn") %>% 
   set_mode("regression")

cart_spec <- 
   decision_tree(cost_complexity = tune(), min_n = tune()) %>% 
   set_engine("rpart") %>% 
   set_mode("regression")

bag_cart_spec <- 
   bag_tree() %>% 
   set_engine("rpart", times = 50L) %>% 
   set_mode("regression")

rf_spec <- 
   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
   set_engine("ranger") %>% 
   set_mode("regression")

xgb_spec <- 
   boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(), 
              min_n = tune(), sample_size = tune(), trees = tune()) %>% 
   set_engine("xgboost") %>% 
   set_mode("regression")

cubist_spec <- 
   cubist_rules(committees = tune(), neighbors = tune()) %>% 
   set_engine("Cubist") 

nnet_param <- 
   nnet_spec %>% 
   extract_parameter_set_dials() %>% 
   # Allow larger networks than the default range; MaxNWts = 2600 in the
   # engine accommodates up to 27 hidden units for these predictors
   update(hidden_units = hidden_units(c(1, 27)))

As a first workflow set example, let's combine the recipe that only standardizes the predictors with the nonlinear models that require the predictors to be in the same units:

normalized <- 
   workflow_set(
      preproc = list(normalized = normalized_rec), 
      models = list(SVM_radial = svm_r_spec, SVM_poly = svm_p_spec, 
                    KNN = knn_spec, neural_network = nnet_spec)
   )
normalized
normalized %>% extract_workflow(id = "normalized_KNN")
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (regression)
## 
## Main Arguments:
##   neighbors = tune()
##   weight_func = tune()
##   dist_power = tune()
## 
## Computational engine: kknn
normalized <- 
   normalized %>% 
   # Attach the custom parameter object so it is used when this workflow is tuned
   option_add(param_info = nnet_param, id = "normalized_neural_network")

For the tree-based, rule-based, and MARS models, which need no preprocessing, let's create another workflow set that uses dplyr selectors for the outcome and predictors instead of a recipe:

model_vars <- 
   workflow_variables(outcomes = compressive_strength, 
                      predictors = everything())

no_pre_proc <- 
   workflow_set(
      preproc = list(simple = model_vars), 
      models = list(MARS = mars_spec, CART = cart_spec, CART_bagged = bag_cart_spec,
                    RF = rf_spec, boosting = xgb_spec, Cubist = cubist_spec)
   )
no_pre_proc

Finally, we assemble the set that pairs the nonlinear terms and interactions with the appropriate models, then bind all three workflow sets together:

with_features <- 
   workflow_set(
      preproc = list(full_quad = poly_recipe), 
      models = list(linear_reg = linear_reg_spec, KNN = knn_spec)
   )
all_workflows <- 
   bind_rows(no_pre_proc, normalized, with_features) %>% 
   # Make the workflow ID's a little more simple: 
   mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflows

1.2 Tuning

Almost all of these workflows contain tuning parameters. To evaluate their performance, we can use the standard tuning or resampling functions (e.g., tune_grid()). The workflow_map() function applies the same function to each workflow in the set; the default is tune_grid().

For this example, grid search is applied to each workflow using up to 5 parameter candidates and the small 3-fold resampling object created earlier, to keep run time short. A common set of options is used for every execution of tune_grid(): the same resampling and control objects for each workflow, along with the grid size. The workflow_map() function also has a seed argument that ensures each execution of tune_grid() consumes the same random numbers.

grid_ctrl <-
   control_grid(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
   )

grid_results <-
   all_workflows %>%
   workflow_map(
      seed = 1503,
      resamples = concrete_folds_mini,
      grid = 5,
      control = grid_ctrl,
      verbose = TRUE
   )
## i  1 of 12 tuning:     MARS
## ✔  1 of 12 tuning:     MARS (1.1s)
## i  2 of 12 tuning:     CART
## ✔  2 of 12 tuning:     CART (2.5s)
## i    No tuning parameters. `fit_resamples()` will be attempted
## i  3 of 12 resampling: CART_bagged
## ✔  3 of 12 resampling: CART_bagged (8.5s)
## i  4 of 12 tuning:     RF
## i Creating pre-processing data to finalize unknown parameter: mtry
## ✔  4 of 12 tuning:     RF (4.7s)
## i  5 of 12 tuning:     boosting
## ✔  5 of 12 tuning:     boosting (8.4s)
## i  6 of 12 tuning:     Cubist
## ✔  6 of 12 tuning:     Cubist (6.3s)
## i  7 of 12 tuning:     SVM_radial
## ✔  7 of 12 tuning:     SVM_radial (4.1s)
## i  8 of 12 tuning:     SVM_poly
## ✔  8 of 12 tuning:     SVM_poly (6.5s)
## i  9 of 12 tuning:     KNN
## ✔  9 of 12 tuning:     KNN (5.1s)
## i 10 of 12 tuning:     neural_network
## ✔ 10 of 12 tuning:     neural_network (7.4s)
## i 11 of 12 tuning:     full_quad_linear_reg
## ✔ 11 of 12 tuning:     full_quad_linear_reg (6.1s)
## i 12 of 12 tuning:     full_quad_KNN
## ✔ 12 of 12 tuning:     full_quad_KNN (36.5s)
# Number of unsummarized metric rows from the full grid search (kept for reference)
num_grid_models <- nrow(collect_metrics(grid_results, summarize = FALSE))
grid_results %>% 
   rank_results() %>% 
   filter(.metric == "rmse") %>% 
   select(model, .config, rmse = mean, rank)
autoplot(
   grid_results,
   rank_metric = "rmse",  # <- how to order models
   metric = "rmse",       # <- which metric to visualize
   select_best = TRUE     # <- one point per workflow
) +
   geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 90, hjust = 1) +
   lims(y = c(3.5, 9.5)) +
   theme(legend.position = "none")
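
The autoplot() method can also show the complete tuning profile for a single workflow via its id argument; for example, for the Cubist workflow:

autoplot(grid_results, id = "Cubist", metric = "rmse")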

1.3 Efficiently Screening Models

The full grid search above is expensive. The racing approach in the finetune package (tune_race_anova()) evaluates every candidate on an initial handful of resamples, then uses an ANOVA model to discard candidates that are statistically unlikely to be the best, so later resamples are only fit for the surviving candidates. We also switch to a slightly larger resampling object with five folds:

library(finetune)

concrete_folds <- 
   vfold_cv(concrete_train, strata = compressive_strength, repeats = 1, v = 5)

race_ctrl <-
   control_race(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
   )

race_results <-
   all_workflows %>%
   workflow_map(
      "tune_race_anova",
      seed = 1503,
      resamples = concrete_folds,
      grid = 5,
      control = race_ctrl,
      verbose = TRUE
   )
## i  1 of 12 tuning:     MARS
## ✔  1 of 12 tuning:     MARS (2.7s)
## i  2 of 12 tuning:     CART
## ✔  2 of 12 tuning:     CART (3.2s)
## i    No tuning parameters. `fit_resamples()` will be attempted
## i  3 of 12 resampling: CART_bagged
## ✔  3 of 12 resampling: CART_bagged (13.9s)
## i  4 of 12 tuning:     RF
## i Creating pre-processing data to finalize unknown parameter: mtry
## ✔  4 of 12 tuning:     RF (7.4s)
## i  5 of 12 tuning:     boosting
## ✔  5 of 12 tuning:     boosting (16.7s)
## i  6 of 12 tuning:     Cubist
## ✔  6 of 12 tuning:     Cubist (9s)
## i  7 of 12 tuning:     SVM_radial
## ✔  7 of 12 tuning:     SVM_radial (4.7s)
## i  8 of 12 tuning:     SVM_poly
## ✔  8 of 12 tuning:     SVM_poly (10.6s)
## i  9 of 12 tuning:     KNN
## ✔  9 of 12 tuning:     KNN (8.1s)
## i 10 of 12 tuning:     neural_network
## ✔ 10 of 12 tuning:     neural_network (13.5s)
## i 11 of 12 tuning:     full_quad_linear_reg
## ✔ 11 of 12 tuning:     full_quad_linear_reg (6.9s)
## i 12 of 12 tuning:     full_quad_KNN
## ✔ 12 of 12 tuning:     full_quad_KNN (47.7s)
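
To see how the racing process pared down the candidates for a single workflow, one option (an added illustration, not part of the original analysis) is finetune's plot_race() on an extracted result, e.g., for the boosting workflow:

race_results %>% 
   extract_workflow_set_result("boosting") %>% 
   plot_race()
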
autoplot(
   race_results,
   rank_metric = "rmse",  
   metric = "rmse",       
   select_best = TRUE    
) +
   geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 90, hjust = 1) +
   lims(y = c(3.0, 9.5)) +
   theme(legend.position = "none")
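
Since the grid and racing runs used different resampling schemes here (three folds versus five), their metrics are not directly comparable, but as an informal check (added, not part of the original text) we can line up each workflow's best RMSE under the two approaches:

rank_results(race_results, select_best = TRUE) %>% 
   filter(.metric == "rmse") %>% 
   select(wflow_id, race_rmse = mean) %>% 
   inner_join(
      rank_results(grid_results, select_best = TRUE) %>% 
         filter(.metric == "rmse") %>% 
         select(wflow_id, grid_rmse = mean),
      by = "wflow_id"
   )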

1.4 Finalizing the Model

Based on these results, we choose the boosting workflow: select its numerically best tuning parameters, finalize the workflow, fit it on the full training set, and evaluate it on the test set with last_fit():

best_results <- 
   race_results %>% 
   extract_workflow_set_result("boosting") %>% 
   select_best(metric = "rmse")
best_results
boosting_test_results <- 
   race_results %>% 
   extract_workflow("boosting") %>% 
   finalize_workflow(best_results) %>% 
   last_fit(split = concrete_split)
collect_metrics(boosting_test_results)
boosting_test_results %>% 
   collect_predictions() %>% 
   ggplot(aes(x = compressive_strength, y = .pred)) + 
   geom_abline(color = "gray50", lty = 2) + 
   geom_point(alpha = 0.5) + 
   coord_obs_pred() + 
   labs(x = "observed", y = "predicted")
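
If the test set performance is acceptable, the fitted workflow can be pulled out of the last_fit() object and used to score new data. Here new_mixtures is a hypothetical data frame with the same predictor columns as concrete:

final_boosting_fit <- extract_workflow(boosting_test_results)
# predict(final_boosting_fit, new_data = new_mixtures)  # new_mixtures is hypothetical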