• 1 Tidymodels
  • 2 GLM
  • 3 Regularlized Regression

1 Tidymodels

Last week we looked at the tidymodels pipeline for the first time. broom (tidy, glance, augment), rsample (splitting data), parsnip (specifying models), and yardstick (evaluating models). For a brief review look at the solution to coding warmup 3 part 4.

Some more helpful tools.

Checking model specification

parsnip::rand_forest() %>%  
  set_engine("ranger") %>%
  set_mode("regression") %>% 
  translate()
## Random Forest Model Specification (regression)
## 
## Computational engine: ranger 
## 
## Model fit template:
## ranger::ranger(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 
##         1))
#?rand_forest

rand_forest(trees = 1000, min_n = 5) %>% 
  set_engine("ranger", verbose = TRUE) %>% 
  set_mode("regression") %>%
  translate()
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 1000
##   min_n = 5
## 
## Engine-Specific Arguments:
##   verbose = TRUE
## 
## Computational engine: ranger 
## 
## Model fit template:
## ranger::ranger(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     num.trees = 1000, min.node.size = min_rows(~5, x), verbose = TRUE, 
##     num.threads = 1, seed = sample.int(10^5, 1))
#parsnip_addin()

As an aside, what is a random forest? It is an ensemble model (meaning it aggregates information from multiple models) which uses numerous decision trees to make predictions (many trees make a forest). In a classification case, usually the prediction is the class selected by the most trees. In the regression case, the average prediction from individual trees is returned.

2 GLM

Let’s run a very basic example on Lending Club data.

library(modeldata)
data("lending_club")

glimpse(lending_club)
## Rows: 9,857
## Columns: 23
## $ funded_amnt                <int> 16100, 32000, 10000, 16800, 3500, 10000, 11~
## $ term                       <fct> term_36, term_60, term_36, term_60, term_36~
## $ int_rate                   <dbl> 13.99, 11.99, 16.29, 13.67, 7.39, 11.47, 5.~
## $ sub_grade                  <fct> C4, C1, D1, C3, A4, B5, A1, B2, B3, C2, C3,~
## $ addr_state                 <fct> CT, MN, OH, NV, CA, TX, KY, MO, NY, GA, NH,~
## $ verification_status        <fct> Not_Verified, Verified, Source_Verified, Ve~
## $ annual_inc                 <dbl> 35000, 72000, 72000, 101000, 50100, 32000, ~
## $ emp_length                 <fct> emp_5, emp_ge_10, emp_ge_10, emp_lt_1, emp_~
## $ delinq_2yrs                <int> 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0~
## $ inq_last_6mths             <int> 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 3, 3, 0, 0, 1~
## $ revol_util                 <dbl> 67.4, 82.2, 74.2, 64.0, 78.3, 68.1, 54.8, 3~
## $ acc_now_delinq             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ open_il_6m                 <int> 1, 5, 3, 6, 0, 1, 2, 8, 1, 0, 3, 1, 2, 1, 2~
## $ open_il_12m                <int> 0, 1, 2, 1, 0, 0, 0, 2, 0, 0, 1, 2, 1, 0, 4~
## $ open_il_24m                <int> 0, 3, 3, 2, 0, 0, 0, 5, 0, 0, 1, 5, 1, 0, 6~
## $ total_bal_il               <int> 1099, 49187, 33378, 55445, 0, 7574, 20998, ~
## $ all_util                   <int> 48, 77, 79, 64, 78, 66, 54, 80, 75, 36, 43,~
## $ inq_fi                     <int> 0, 0, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 0, 1, 7~
## $ inq_last_12m               <int> 3, 0, 4, 4, 0, 0, 0, 4, 6, 1, 3, 5, 1, 0, 5~
## $ delinq_amnt                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ num_il_tl                  <int> 3, 9, 9, 10, 3, 9, 7, 21, 7, 1, 7, 19, 7, 6~
## $ total_il_high_credit_limit <int> 13193, 65945, 39387, 60188, 0, 12131, 39486~
## $ Class                      <fct> good, good, good, good, good, good, good, g~
lending_club <- lending_club %>%
  mutate(Class = relevel(Class, "good"))

lending_club_mini <- lending_club %>%
  select(Class, annual_inc, verification_status, sub_grade)

lending_club_mini %>% count(Class)
ABCDEFGHIJ0123456789
Class
<fct>
n
<int>
good9340
bad517

Each row is a loan which is either Classified as good or bad.

show_engines("logistic_reg")
ABCDEFGHIJ0123456789
engine
<chr>
mode
<chr>
glmclassification
glmnetclassification
LiblineaRclassification
sparkclassification
kerasclassification
stanclassification
bruleeclassification
split <- initial_split(lending_club_mini)

train <- training(split)
test <- testing(split)

train %>% count(Class)
ABCDEFGHIJ0123456789
Class
<fct>
n
<int>
good7005
bad387

We can see that there is a really large class imbalance. Most loans are good. This is something to watch out for! If our model ‘predicts’ every situation as good it will perform deceptively well.

log_model <- logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification') %>%
  fit(Class ~ annual_inc + verification_status + sub_grade,
      data = train)

log_model %>% tidy()
ABCDEFGHIJ0123456789
term
<chr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
p.value
<dbl>
(Intercept)-5.203664e+005.912702e-01-8.800822491.358170e-18
annual_inc1.423934e-069.886871e-071.440226771.498033e-01
verification_statusSource_Verified4.578449e-021.410139e-010.324680637.454228e-01
verification_statusVerified2.115456e-011.450028e-011.458907251.445906e-01
sub_gradeA2-2.351925e-011.158265e+00-0.203055908.390913e-01
sub_gradeA36.909194e-018.210430e-010.841514344.000599e-01
sub_gradeA4-3.312545e-011.158238e+00-0.285998577.748792e-01
sub_gradeA59.448456e-017.119087e-011.327200491.844424e-01
sub_gradeB17.977799e-017.111029e-011.121890942.619088e-01
sub_gradeB25.811782e-017.343734e-010.791393274.287145e-01
# log_model %>%
#   car::vif()

log_model %>% extract_fit_engine() %>%
  car::vif()
##                         GVIF Df GVIF^(1/(2*Df))
## annual_inc          1.028086  1        1.013946
## verification_status 1.095369  2        1.023034
## sub_grade           1.117908 34        1.001640

Multicollinearity is a big issue. If variables are correlated to both the target variable(s) and themselves, the model will not be able to distuingish between their influence. Variance Inflation Factors, (vif 1, orthogonal/uncorrelated, >5 issue).

preds <- log_model %>%
  predict(test, type='prob')

preds <- log_model %>%
  predict(test, type='class')

test_preds <- log_model %>% augment(test)

test_preds %>% select(Class, .pred_class, .pred_good, .pred_bad)
ABCDEFGHIJ0123456789
Class
<fct>
.pred_class
<fct>
.pred_good
<dbl>
.pred_bad
<dbl>
goodgood0.91724748.275263e-02
goodgood0.97286042.713963e-02
goodgood0.97011462.988543e-02
goodgood0.96011883.988117e-02
goodgood0.95429494.570508e-02
goodgood0.95812374.187631e-02
goodgood0.97063312.936687e-02
goodgood0.93410786.589217e-02
goodgood0.96399163.600837e-02
goodgood0.96011883.988117e-02

We can predict a class metric or a class probability metric (recall from yardstick).

test_preds %>% count(Class, .pred_class)
ABCDEFGHIJ0123456789
Class
<fct>
.pred_class
<fct>
n
<int>
goodgood2333
goodbad2
badgood130
ggplot(test_preds, aes(x=.pred_good, color=Class)) + geom_density()

test_preds %>% group_by(Class) %>% summarize(mean(.pred_good), median(.pred_good))
ABCDEFGHIJ0123456789
Class
<fct>
mean(.pred_good)
<dbl>
median(.pred_good)
<dbl>
good0.95066770.9604447
bad0.91231240.9205804
roc_auc(test_preds,
        truth = Class,
        estimate = .pred_good)
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
roc_aucbinary0.740252
roc_curve(test_preds,
        truth = Class,
        estimate = .pred_good) %>%
  autoplot()

specificity(test_preds,
        truth = Class,
        estimate = .pred_class )
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
specificitybinary0
sensitivity(test_preds,
        truth = Class,
        estimate = .pred_class)
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
sensitivitybinary0.9991435
test_preds %>% count(.pred_class)
ABCDEFGHIJ0123456789
.pred_class
<fct>
n
<int>
good2463
bad2

We can see that this model always predicts good, but this model still has some ability to distinguish between classes. We just need to look at setting different probability thresholds. This will be something we explore more as the class goes on so don’t worry if this seems complicated!

The probably package lets us find the threshold which maximizes sensitivity (true positives) and specificity (true negatives).

threshold_data <- test_preds %>%
  probably::threshold_perf(Class, .pred_good, thresholds = seq(0.5, 1, by = 0.0025))


ggplot(threshold_data %>% filter(.metric != 'distance'), 
       aes(x = .threshold, y = .estimate, color = .metric)) +
  geom_line() +
  scale_color_viridis_d(end = 0.9) +
  scale_alpha_manual(values = c(.4, 1), guide = "none") +
  labs(
    x = "'Good' Threshold\n(above this value is considered 'good')",
    y = "Metric Estimate",
    title = "Balancing performance by varying the threshold",
    subtitle = "Sensitivity or specificity alone might not be enough!\nVertical line = Max J-Index"
  )

max_threshold <- threshold_data %>% filter(.metric == 'j_index') %>%
  filter(.estimate==max(.estimate)) %>%
  pull(.threshold)

test_preds2 <- test_preds %>%
  mutate(.pred = probably::make_two_class_pred(.pred_good, levels(Class), threshold = max_threshold)) %>%
  select(Class, contains(".pred"))

test_preds2 %>% count(.pred)
ABCDEFGHIJ0123456789
.pred
<clss_prd>
n
<int>
good1648
bad817
precision(test_preds2,
        truth = Class,
        estimate = .pred_class) 
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
precisionbinary0.9472188
recall(test_preds2,
        truth = Class,
        estimate = .pred_class) 
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
recallbinary0.9991435
f_meas(test_preds2,
        truth = Class,
        estimate = .pred_class) 
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
f_measbinary0.9724885

3 Regularlized Regression

split <- initial_split(lending_club)

train <- training(split)
test <- testing(split)


log_model2 <- logistic_reg(penalty = 0.1) %>%
  set_engine('glmnet') %>%
  set_mode('classification') %>%
  fit(Class ~ .,
      data = train)

log_model2 %>% tidy() %>% filter(abs(estimate) > 0)
ABCDEFGHIJ0123456789
term
<chr>
estimate
<dbl>
penalty
<dbl>
(Intercept)-2.8610570.1
log_model3 <- logistic_reg(penalty = 0.01) %>%
  set_engine('glmnet') %>%
  set_mode('classification') %>%
  fit(Class ~ .,
      data = train)

log_model3 %>% tidy() %>% filter(abs(estimate) > 0)
ABCDEFGHIJ0123456789
term
<chr>
estimate
<dbl>
penalty
<dbl>
(Intercept)-4.688017300.01
int_rate0.124102080.01
open_il_12m0.062487620.01
inq_fi0.029347690.01
test_preds_l3 <- log_model3 %>% augment(test)

roc_curve(test_preds_l3,
        truth = Class,
        estimate = .pred_good) %>%
  autoplot()

roc_auc(test_preds_l3,
        truth = Class,
        estimate = .pred_good)
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
roc_aucbinary0.7364023
log_model4 <- logistic_reg(penalty = 0.001) %>%
  set_engine('glmnet') %>%
  set_mode('classification') %>%
  fit(Class ~ .,
      data = train)

log_model4 %>% tidy() %>% filter(abs(estimate) > 0)
ABCDEFGHIJ0123456789
term
<chr>
estimate
<dbl>
penalty
<dbl>
(Intercept)-5.342111e+000.001
funded_amnt2.656272e-060.001
termterm_60-3.404180e-010.001
int_rate1.605080e-010.001
sub_gradeA2-3.515330e-010.001
sub_gradeA4-6.659451e-010.001
sub_gradeB2-7.606505e-010.001
sub_gradeC11.062150e-010.001
sub_gradeC41.408598e-010.001
sub_gradeC54.157332e-010.001
test_preds_l4 <- log_model4 %>% augment(test)

roc_curve(test_preds_l4,
        truth = Class,
        estimate = .pred_good) %>%
  autoplot()

roc_auc(test_preds_l4,
        truth = Class,
        estimate = .pred_good)
ABCDEFGHIJ0123456789
.metric
<chr>
.estimator
<chr>
.estimate
<dbl>
roc_aucbinary0.7453297