library(tidyverse)
library(tidymodels)

if(dir.exists("static/lectures")){
  model_fit <- readRDS("static/lectures/model_fit.RDS")
  modeldata <- readRDS("static/lectures/modeldata.RDS") 
} else {
  base_str <- "https://github.com/erhla/pa470spring2022/raw/main/static/lectures/"
  model_fit <- readRDS(url(paste0(base_str, "model_fit.RDS")))
  modeldata <- readRDS(url(paste0(base_str, "modeldata.RDS")))
}


# keep only the modeling variables and treat all character columns as factors
modeldata <- modeldata %>% 
  select(sp_log, parcel_num, Neighborhood, ASSESSEDVALUE,
    total_squa, heightcat, extcat, has_garage,
    has_basement, bathcat, total_floo, pct_over_50,
    foreclosures_per_prop,
    year_built, GEOID, sale_date) %>% 
  mutate(across(where(is.character), as.factor))

1 Ch 18, Explaining Models/Predictions

Why do our models make the predictions they do? Model explanations can shed some light on this. For this assignment, I have saved my model data and fitted model from the Detroit Assessment project.

1.1 Techniques

Following the textbook, there are two kinds of approaches:

  • vip functions when we want to use model-based methods that take advantage of model structure (and are often faster), and
  • DALEX functions when we want to use model-agnostic methods that can be applied to any model.

Let’s look at building a model-agnostic explainer for our model.

library(DALEXtra)
## Loading required package: DALEX
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies
## 
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
## 
##     explain
# keep only parcels that actually sold and have a recorded year built
sales <- modeldata %>% filter(!is.na(sp_log), year_built != 0)

explain_reg <- explain_tidymodels(
  model_fit,
  data = sales,
  y = sales$sp_log,
  verbose = TRUE
)
## Preparation of a new explainer is initiated
##   -> model label       :  workflow  (  default  )
##   -> data              :  4336  rows  16  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  4336  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 1.0.0 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  3.836263 , mean =  4.510255 , max =  5.634867  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -1.258597 , mean =  -0.1755897 , max =  0.6319306  
##   A new explainer has been created!
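
Once built, the explainer can be used directly for predictions: predict() on a DALEX explainer dispatches to the wrapped workflow. A quick sanity check (nothing here beyond DALEX's own predict method):

# predictions from the explainer should roughly track the observed log prices
head(predict(explain_reg, sales))
head(sales$sp_log)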

1.2 Local Explanations

A local explanation will provide information about a prediction for a single observation.

targ <- modeldata %>% filter(parcel_num == '22027164.')

# transpose the single row so each feature prints on its own line
targ %>% t() %>%
  kableExtra::kable() %>%
  kableExtra::kable_material(c("striped", "hover"))
sp_log                  4.845098
parcel_num              22027164.
Neighborhood            0033A
ASSESSEDVALUE           19900
total_squa              4792
heightcat               2
extcat                  3
has_garage              1
has_basement            1
bathcat                 1
total_floo              1462
pct_over_50             0.6666667
foreclosures_per_prop   0.1332053
year_built              1941
GEOID                   26163539400
sale_date               2019-01-01
By default, predict_parts() computes a break-down explanation, attributing the prediction to each feature sequentially.

preds <- predict_parts(explainer = explain_reg, 
                       new_observation = targ)

preds

Break-down contributions depend on the order in which features are considered, and they are expensive to compute. A more robust choice is Shapley Additive Explanations (SHAP), “where the average contributions of features are computed under different combinations or ‘coalitions’ of feature orderings.”

preds_shap <- 
  predict_parts(
    explainer = explain_reg, 
    new_observation = targ, 
    type = "shap",
    B = 20  # number of random feature orderings to average over
  )

plot(preds_shap)
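
For more control over the figure, the textbook builds its SHAP plot by hand. A minimal sketch of that idea, assuming only the standard variable and contribution columns of a DALEX shap object:

library(forcats)

as.data.frame(preds_shap) %>%
  group_by(variable) %>%
  summarise(mean_contribution = mean(contribution)) %>%      # average over the B orderings
  mutate(variable = fct_reorder(variable, abs(mean_contribution))) %>%
  ggplot(aes(mean_contribution, variable, fill = mean_contribution > 0)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Average SHAP contribution", y = NULL)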

Let’s look at another property.

targ2 <- modeldata %>% filter(parcel_num == '08009807.')

predict_parts(
  explainer = explain_reg,
  new_observation = targ2,
  type = "shap",
  B = 20
) %>% plot()

Since the explanation is local, each observation has its own set of feature contributions.

1.3 Global Explanation

Global explanations, also called variable importance, help us understand which features are most important in driving predictions aggregated over the whole data set.

One way to compute variable importance is permutation: shuffle the values of a single feature and measure how much worse the model's predictions become relative to the unshuffled data. The model is not refit; only that feature's values are scrambled.
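
To fix ideas, here is a hand-rolled version of a single permutation step (rmse_for is a hypothetical helper, and predict() works on the explainer itself):

# shuffle one feature and compare RMSE before/after (illustration only)
rmse_for <- function(df) {
  sqrt(mean((df$sp_log - predict(explain_reg, df))^2))
}

shuffled <- sales
shuffled$total_floo <- sample(shuffled$total_floo)

rmse_for(sales)     # baseline loss
rmse_for(shuffled)  # a larger value means total_floo carries real signal

model_parts() repeats this shuffling across every feature, several times each, recording the loss after each permutation.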

vip_model <- model_parts(explain_reg, 
                         loss_function = loss_root_mean_square)

plot(vip_model)

The helper function below mirrors Figure 18.4 from the textbook.

ggplot_imp <- function(...) {
  obj <- list(...)
  metric_name <- attr(obj[[1]], "loss_name")
  metric_lab <- paste(metric_name, 
                      "after permutations\n(higher indicates more important)")
  
  full_vip <- bind_rows(obj) %>%
    filter(variable != "_baseline_")
  
  perm_vals <- full_vip %>% 
    filter(variable == "_full_model_") %>% 
    group_by(label) %>% 
    summarise(dropout_loss = mean(dropout_loss))
  
  p <- full_vip %>%
    filter(variable != "_full_model_") %>% 
    mutate(variable = fct_reorder(variable, dropout_loss)) %>%
    ggplot(aes(dropout_loss, variable)) 
  if(length(obj) > 1) {
    p <- p + 
      facet_wrap(vars(label)) +
      geom_vline(data = perm_vals, aes(xintercept = dropout_loss, color = label),
                 size = 1.4, lty = 2, alpha = 0.7) +
      geom_boxplot(aes(color = label, fill = label), alpha = 0.2)
  } else {
    p <- p + 
      geom_vline(data = perm_vals, aes(xintercept = dropout_loss),
                 size = 1.4, lty = 2, alpha = 0.7) +
      geom_boxplot(fill = "#91CBD765", alpha = 0.4)
    
  }
  p +
    theme(legend.position = "none") +
    labs(x = metric_lab, 
         y = NULL,  fill = NULL,  color = NULL)
}

ggplot_imp(vip_model)

1.4 Partial Dependence

We can also build global model explanations by aggregating many local explanations. A partial dependence profile shows how the expected prediction changes as one feature varies while everything else is held fixed, averaged over the observations.
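
Conceptually a profile is simple to compute by hand: fix the feature of interest at a grid value for every observation, predict, and average. A minimal sketch (the grid endpoints are arbitrary):

grid <- seq(1900, 2000, by = 10)

pdp_by_hand <- sapply(grid, function(yr) {
  tmp <- sales
  tmp$year_built <- yr             # hold year_built fixed for every parcel
  mean(predict(explain_reg, tmp))  # average prediction at this grid value
})

plot(grid, pdp_by_hand, type = "l",
     xlab = "year_built", ylab = "mean predicted sp_log")

model_profile() computes the same aggregated profile (plus the individual profiles behind it); the helper below plots both.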

ggplot_pdp <- function(obj, x) {
  
  p <- 
    as_tibble(obj$agr_profiles) %>%
    mutate(`_label_` = stringr::str_remove(`_label_`, "^[^_]*_")) %>%
    ggplot(aes(`_x_`, `_yhat_`)) +
    geom_line(data = as_tibble(obj$cp_profiles),
              aes(x = {{ x }}, group = `_ids_`),
              size = 0.5, alpha = 0.05, color = "gray50")
  
  num_colors <- n_distinct(obj$agr_profiles$`_label_`)
  
  if (num_colors > 1) {
    p <- p + geom_line(aes(color = `_label_`), size = 1.2, alpha = 0.8)
  } else {
    p <- p + geom_line(color = "midnightblue", size = 1.2, alpha = 0.8)
  }
  
  p
}

pdp_age <- model_profile(explain_reg, 
                         N = 500,
                         variables = "year_built")

ggplot_pdp(pdp_age, year_built) +
  labs(x = "Year built", 
       y = "Sale Price (log)", 
       color = NULL)

pdp_floor <- model_profile(explain_reg, 
                           N = 500,
                           variables = "total_floo")

ggplot_pdp(pdp_floor, total_floo) +
  scale_x_log10() +
  labs(x = "Total Floor Area", 
       y = "Sale Price (log)", 
       color = NULL)