What if we don’t know which model to use?
data(concrete, package = "modeldata")
glimpse(concrete)
## Rows: 1,030
## Columns: 9
## $ cement <dbl> 540.0, 540.0, 332.5, 332.5, 198.6, 266.0, 380.0, …
## $ blast_furnace_slag <dbl> 0.0, 0.0, 142.5, 142.5, 132.4, 114.0, 95.0, 95.0,…
## $ fly_ash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ water <dbl> 162, 162, 228, 228, 192, 228, 228, 228, 228, 228,…
## $ superplasticizer <dbl> 2.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
## $ coarse_aggregate <dbl> 1040.0, 1055.0, 932.0, 932.0, 978.4, 932.0, 932.0…
## $ fine_aggregate <dbl> 676.0, 676.0, 594.0, 594.0, 825.5, 670.0, 594.0, …
## $ age <int> 28, 28, 270, 365, 360, 90, 365, 28, 28, 28, 90, 2…
## $ compressive_strength <dbl> 79.99, 61.89, 40.27, 41.05, 44.30, 47.03, 43.70, …
concrete <-
concrete %>%
group_by(across(-compressive_strength)) %>%
summarize(compressive_strength = mean(compressive_strength),
.groups = "drop")
nrow(concrete)
## [1] 992
set.seed(1501)
concrete_split <- initial_split(concrete, strata = compressive_strength)
concrete_train <- training(concrete_split)
concrete_test <- testing(concrete_split)
set.seed(1502)
concrete_folds_mini <-
vfold_cv(concrete_train, strata = compressive_strength, repeats=1, v=3)
normalized_rec <-
recipe(compressive_strength ~ ., data = concrete_train) %>%
step_normalize(all_predictors())
poly_recipe <-
normalized_rec %>%
step_poly(all_predictors()) %>%
step_interact(~ all_predictors():all_predictors())
Some models (notably neural networks, K-nearest neighbors, and support vector machines) require predictors that have been centered and scaled, so some model workflows will require recipes with these preprocessing steps. For other models, a traditional response surface design model expansion (i.e., quadratic and two-way interactions) is a good idea. For these purposes, we create two recipes:
library(rules)
##
## Attaching package: 'rules'
## The following object is masked from 'package:dials':
##
## max_rules
library(baguette)
linear_reg_spec <-
linear_reg(penalty = tune(), mixture = tune()) %>%
set_engine("glmnet")
nnet_spec <-
mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
set_engine("nnet", MaxNWts = 2600) %>%
set_mode("regression")
mars_spec <-
mars(prod_degree = tune()) %>% #<- use GCV to choose terms
set_engine("earth") %>%
set_mode("regression")
svm_r_spec <-
svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
set_engine("kernlab") %>%
set_mode("regression")
svm_p_spec <-
svm_poly(cost = tune(), degree = tune()) %>%
set_engine("kernlab") %>%
set_mode("regression")
knn_spec <-
nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune()) %>%
set_engine("kknn") %>%
set_mode("regression")
cart_spec <-
decision_tree(cost_complexity = tune(), min_n = tune()) %>%
set_engine("rpart") %>%
set_mode("regression")
bag_cart_spec <-
bag_tree() %>%
set_engine("rpart", times = 50L) %>%
set_mode("regression")
rf_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
xgb_spec <-
boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(),
min_n = tune(), sample_size = tune(), trees = tune()) %>%
set_engine("xgboost") %>%
set_mode("regression")
cubist_spec <-
cubist_rules(committees = tune(), neighbors = tune()) %>%
set_engine("Cubist")
nnet_param <-
nnet_spec %>%
extract_parameter_set_dials() %>%
update(hidden_units = hidden_units(c(1, 27)))
As a first workflow set example, let’s combine the recipe that only standardizes the predictors to the nonlinear models that require that the predictors be in the same units.
normalized <-
workflow_set(
preproc = list(normalized = normalized_rec),
models = list(SVM_radial = svm_r_spec, SVM_poly = svm_p_spec,
KNN = knn_spec, neural_network = nnet_spec)
)
normalized
normalized %>% extract_workflow(id = "normalized_KNN")
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (regression)
##
## Main Arguments:
## neighbors = tune()
## weight_func = tune()
## dist_power = tune()
##
## Computational engine: kknn
normalized <-
normalized %>%
option_add(param_info = nnet_param, id = "normalized_neural_network")
For the other nonlinear models, let’s create another workflow set that uses dplyr selectors for the outcome and predictors:
model_vars <-
workflow_variables(outcomes = compressive_strength,
predictors = everything())
no_pre_proc <-
workflow_set(
preproc = list(simple = model_vars),
models = list(MARS = mars_spec, CART = cart_spec, CART_bagged = bag_cart_spec,
RF = rf_spec, boosting = xgb_spec, Cubist = cubist_spec)
)
no_pre_proc
Finally, the set that uses nonlinear terms and interactions with the appropriate models are assembled:
with_features <-
workflow_set(
preproc = list(full_quad = poly_recipe),
models = list(linear_reg = linear_reg_spec, KNN = knn_spec)
)
all_workflows <-
bind_rows(no_pre_proc, normalized, with_features) %>%
# Make the workflow ID's a little more simple:
mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflows
Almost all of these workflows contain tuning parameters. In order to
evaluate their performance, we can use the standard tuning or resampling
functions (e.g., tune_grid()
and so on). The
workflow_map()
function will apply the same function to all
of the workflows in the set; the default is
tune_grid()
.
For this example, grid search is applied to each workflow using up to
25 different parameter candidates. There are a set of common options to
use with each execution of tune_grid()
. For example, we
will use the same resampling and control objects for each workflow,
along with a grid size of 25. The workflow_map()
function
has an additional argument called seed
that is used to
ensure that each execution of tune_grid()
consumes the same
random numbers.
grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
grid_results <-
all_workflows %>%
workflow_map(
seed = 1503,
resamples = concrete_folds_mini,
grid = 5,
control = grid_ctrl,
verbose = TRUE
)
## i 1 of 12 tuning: MARS
## ✔ 1 of 12 tuning: MARS (1.1s)
## i 2 of 12 tuning: CART
## ✔ 2 of 12 tuning: CART (2.5s)
## i No tuning parameters. `fit_resamples()` will be attempted
## i 3 of 12 resampling: CART_bagged
## ✔ 3 of 12 resampling: CART_bagged (8.5s)
## i 4 of 12 tuning: RF
## i Creating pre-processing data to finalize unknown parameter: mtry
## ✔ 4 of 12 tuning: RF (4.7s)
## i 5 of 12 tuning: boosting
## ✔ 5 of 12 tuning: boosting (8.4s)
## i 6 of 12 tuning: Cubist
## ✔ 6 of 12 tuning: Cubist (6.3s)
## i 7 of 12 tuning: SVM_radial
## ✔ 7 of 12 tuning: SVM_radial (4.1s)
## i 8 of 12 tuning: SVM_poly
## ✔ 8 of 12 tuning: SVM_poly (6.5s)
## i 9 of 12 tuning: KNN
## ✔ 9 of 12 tuning: KNN (5.1s)
## i 10 of 12 tuning: neural_network
## ✔ 10 of 12 tuning: neural_network (7.4s)
## i 11 of 12 tuning: full_quad_linear_reg
## ✔ 11 of 12 tuning: full_quad_linear_reg (6.1s)
## i 12 of 12 tuning: full_quad_KNN
## ✔ 12 of 12 tuning: full_quad_KNN (36.5s)
num_grid_models <- nrow(collect_metrics(grid_results, summarize = FALSE))
grid_results %>%
rank_results() %>%
filter(.metric == "rmse") %>%
select(model, .config, rmse = mean, rank)
autoplot(
grid_results,
rank_metric = "rmse", # <- how to order models
metric = "rmse", # <- which metric to visualize
select_best = TRUE # <- one point per workflow
) +
geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 90, hjust = 1) +
lims(y = c(3.5, 9.5)) +
theme(legend.position = "none")
library(finetune)
concrete_folds <-
vfold_cv(concrete_train, strata = compressive_strength, repeats=1, v=5)
race_ctrl <-
control_race(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
race_results <-
all_workflows %>%
workflow_map(
"tune_race_anova",
seed = 1503,
resamples = concrete_folds,
grid = 5,
control = race_ctrl,
verbose = TRUE
)
## i 1 of 12 tuning: MARS
## ✔ 1 of 12 tuning: MARS (2.7s)
## i 2 of 12 tuning: CART
## ✔ 2 of 12 tuning: CART (3.2s)
## i No tuning parameters. `fit_resamples()` will be attempted
## i 3 of 12 resampling: CART_bagged
## ✔ 3 of 12 resampling: CART_bagged (13.9s)
## i 4 of 12 tuning: RF
## i Creating pre-processing data to finalize unknown parameter: mtry
## ✔ 4 of 12 tuning: RF (7.4s)
## i 5 of 12 tuning: boosting
## ✔ 5 of 12 tuning: boosting (16.7s)
## i 6 of 12 tuning: Cubist
## ✔ 6 of 12 tuning: Cubist (9s)
## i 7 of 12 tuning: SVM_radial
## ✔ 7 of 12 tuning: SVM_radial (4.7s)
## i 8 of 12 tuning: SVM_poly
## ✔ 8 of 12 tuning: SVM_poly (10.6s)
## i 9 of 12 tuning: KNN
## ✔ 9 of 12 tuning: KNN (8.1s)
## i 10 of 12 tuning: neural_network
## ✔ 10 of 12 tuning: neural_network (13.5s)
## i 11 of 12 tuning: full_quad_linear_reg
## ✔ 11 of 12 tuning: full_quad_linear_reg (6.9s)
## i 12 of 12 tuning: full_quad_KNN
## ✔ 12 of 12 tuning: full_quad_KNN (47.7s)
autoplot(
race_results,
rank_metric = "rmse",
metric = "rmse",
select_best = TRUE
) +
geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 90, hjust = 1) +
lims(y = c(3.0, 9.5)) +
theme(legend.position = "none")
best_results <-
race_results %>%
extract_workflow_set_result("boosting") %>%
select_best(metric = "rmse")
best_results
boosting_test_results <-
race_results %>%
extract_workflow("boosting") %>%
finalize_workflow(best_results) %>%
last_fit(split = concrete_split)
collect_metrics(boosting_test_results)
boosting_test_results %>%
collect_predictions() %>%
ggplot(aes(x = compressive_strength, y = .pred)) +
geom_abline(color = "gray50", lty = 2) +
geom_point(alpha = 0.5) +
coord_obs_pred() +
labs(x = "observed", y = "predicted")