# Paths to the local and solutions copies of the Detroit sales database
p1 <- "database/detroit.sqlite"
p2 <- "static/solutions/database/detroit.sqlite"
if(dir.exists("static/lectures")){
  model_fit <- readRDS("static/lectures/model_fit.RDS")
  modeldata <- readRDS("static/lectures/modeldata.RDS")
} else {
  base_str <- "https://github.com/erhla/pa470spring2022/raw/main/static/lectures/"
  model_fit <- readRDS(url(paste0(base_str, "model_fit.RDS")))
  modeldata <- readRDS(url(paste0(base_str, "modeldata.RDS")))
}
modeldata <- modeldata %>%
select(sp_log, parcel_num, Neighborhood, ASSESSEDVALUE,
total_squa, heightcat, extcat, has_garage,
has_basement, bathcat, total_floo, pct_over_50,
foreclosures_per_prop,
year_built, GEOID, sale_date) %>%
mutate(across(where(is.character), as.factor))
Why do our models make the predictions they do? Model explanations can shed some light on this. For this assignment, I have saved my model data and fitted model from the Detroit Assessment project.
In general, we use vip functions when we want model-based methods that take advantage of model structure (and are often faster), and DALEX functions when we want model-agnostic methods that can be applied to any model. Let's look at building a model-agnostic explainer for our model.
library(DALEXtra)
## Loading required package: DALEX
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
sales <- modeldata %>% filter(!is.na(sp_log), year_built != 0)
explain_reg <- explain_tidymodels(
model_fit,
data = sales,
y = sales$sp_log,
verbose=TRUE
)
## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 4336 rows 16 cols
## -> data : tibble converted into a data.frame
## -> target variable : 4336 values
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task regression ( default )
## -> predicted values : numerical, min = 3.836263 , mean = 4.510255 , max = 5.634867
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -1.258597 , mean = -0.1755897 , max = 0.6319306
## A new explainer has been created!
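Before building explanations, it is worth sanity-checking the explainer. As a quick sketch: DALEX explainers carry a predict() wrapper, and model_performance() summarizes the same residuals the explainer just reported.

# Predictions routed through the explainer's wrapped predict function
predict(explain_reg, sales[1:5, ])

# Residual summary across the whole dataset
model_performance(explain_reg)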
A local explanation provides information about a prediction for a single observation.
targ <- modeldata %>% filter(parcel_num == '22027164.')
targ %>% t() %>%
kableExtra::kable() %>%
kableExtra::kable_material(c("striped", "hover"))
| variable | value |
|---|---|
| sp_log | 4.845098 |
| parcel_num | 22027164. |
| Neighborhood | 0033A |
| ASSESSEDVALUE | 19900 |
| total_squa | 4792 |
| heightcat | 2 |
| extcat | 3 |
| has_garage | 1 |
| has_basement | 1 |
| bathcat | 1 |
| total_floo | 1462 |
| pct_over_50 | 0.6666667 |
| foreclosures_per_prop | 0.1332053 |
| year_built | 1941 |
| GEOID | 26163539400 |
| sale_date | 2019-01-01 |
preds <- predict_parts(explainer = explain_reg,
new_observation = targ)
preds
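By default, predict_parts returns break-down contributions. Plotting them shows how each feature moves this property's prediction away from the average prediction:

plot(preds)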
Break-down contributions like these depend on the ordering of the features, and it is expensive to compute contributions under every possible ordering. Best practice is to use Shapley Additive Explanations (SHAP), "where the average contributions of features are computed under different combinations or 'coalitions' of feature orderings."
preds_shap <-
predict_parts(
explainer = explain_reg,
new_observation = targ,
type = "shap",
B = 20
)
plot(preds_shap)
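The plot is backed by a plain data frame. As a sketch (assuming, as in iBreakDown's shap output, that per-permutation rows are tagged by a B column and the rows with B == 0 hold the averaged contributions), we can recover the bar heights ourselves:

as_tibble(preds_shap) %>%
  filter(B == 0) %>%   # assumption: B == 0 marks the rows averaged over orderings
  select(variable, contribution) %>%
  arrange(desc(abs(contribution)))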
Let’s look at another property.
targ2 <- modeldata %>% filter(parcel_num == '08009807.')
predict_parts(
explainer = explain_reg,
new_observation = targ2,
type = "shap",
B = 20
) %>% plot()
The explanation is local, so each data point has unique contributions.
Global explanations, also called variable importance, help us understand which features are most important in driving predictions aggregated over all the data.
One way to compute variable importance is to permute each feature in turn and measure how much worse the model's predictions get on the shuffled data compared to the original.
vip_model <- model_parts(explain_reg,
loss_function = loss_root_mean_square)
plot(vip_model)
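To make the permutation idea concrete, here is a minimal from-scratch sketch for a single feature. This is an illustration under the assumption that model_fit is a fitted regression workflow returning a .pred column; model_parts does this more carefully across all features and repetitions.

# RMSE of the workflow's predictions on a data frame
rmse_of <- function(df) {
  sqrt(mean((df$sp_log - predict(model_fit, df)$.pred)^2))
}

set.seed(1)
baseline <- rmse_of(sales)

shuffled <- sales
shuffled$year_built <- sample(shuffled$year_built)  # break the feature's link to the target
permuted <- rmse_of(shuffled)

c(baseline = baseline, permuted = permuted, increase = permuted - baseline)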
Mirroring Figure 18.4 from the textbook.
ggplot_imp <- function(...) {
obj <- list(...)
metric_name <- attr(obj[[1]], "loss_name")
metric_lab <- paste(metric_name,
"after permutations\n(higher indicates more important)")
full_vip <- bind_rows(obj) %>%
filter(variable != "_baseline_")
perm_vals <- full_vip %>%
filter(variable == "_full_model_") %>%
group_by(label) %>%
summarise(dropout_loss = mean(dropout_loss))
p <- full_vip %>%
filter(variable != "_full_model_") %>%
mutate(variable = fct_reorder(variable, dropout_loss)) %>%
ggplot(aes(dropout_loss, variable))
if(length(obj) > 1) {
p <- p +
facet_wrap(vars(label)) +
geom_vline(data = perm_vals, aes(xintercept = dropout_loss, color = label),
size = 1.4, lty = 2, alpha = 0.7) +
geom_boxplot(aes(color = label, fill = label), alpha = 0.2)
} else {
p <- p +
geom_vline(data = perm_vals, aes(xintercept = dropout_loss),
size = 1.4, lty = 2, alpha = 0.7) +
geom_boxplot(fill = "#91CBD765", alpha = 0.4)
}
p +
theme(legend.position = "none") +
labs(x = metric_lab,
y = NULL, fill = NULL, color = NULL)
}
ggplot_imp(vip_model)
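For contrast, the model-based route mentioned at the top uses the vip package on the fit inside the workflow. This is a sketch, not part of the original analysis, and it only works if the underlying engine reports its own importance scores (e.g., lm or xgboost; a ranger fit needs importance enabled at training time):

library(vip)

model_fit %>%
  workflows::extract_fit_parsnip() %>%   # pull the parsnip fit out of the workflow
  vip(num_features = 10)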
We can also build global model explanations by aggregating local explanations into partial dependence profiles, which trace how the average prediction changes as a single feature varies while the others are held fixed.
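As a sketch of what "aggregating local explanations" means here (an illustration, not DALEX's internal code): fix year_built to each value on a grid, predict for a sample of sales, and average. The grid range is my own choice.

# Average prediction at each candidate year_built, everything else held fixed
grid <- seq(1900, 2000, by = 10)
sampled <- sales %>% slice_sample(n = 200)

purrr::map_dfr(grid, function(yr) {
  tmp <- sampled
  tmp$year_built <- yr
  tibble(year_built = yr, avg_pred = mean(predict(model_fit, tmp)$.pred))
})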
ggplot_pdp <- function(obj, x) {
p <-
as_tibble(obj$agr_profiles) %>%
mutate(`_label_` = stringr::str_remove(`_label_`, "^[^_]*_")) %>%
ggplot(aes(`_x_`, `_yhat_`)) +
geom_line(data = as_tibble(obj$cp_profiles),
aes(x = {{ x }}, group = `_ids_`),
size = 0.5, alpha = 0.05, color = "gray50")
num_colors <- n_distinct(obj$agr_profiles$`_label_`)
if (num_colors > 1) {
p <- p + geom_line(aes(color = `_label_`), size = 1.2, alpha = 0.8)
} else {
p <- p + geom_line(color = "midnightblue", size = 1.2, alpha = 0.8)
}
p
}
pdp_age <- model_profile(explain_reg,
                         N = 500,
                         variables = "year_built")
ggplot_pdp(pdp_age, year_built) +
labs(x = "Year built",
y = "Sale Price (log)",
color = NULL)
pdp_floor <- model_profile(explain_reg,
                           N = 500,
                           variables = "total_floo")
ggplot_pdp(pdp_floor, total_floo) +
scale_x_log10() +
labs(x = "Total Floor Area",
y = "Sale Price (log)",
color = NULL)
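DALEX's own plot method can also draw several profiles at once without the custom helper; a quick sketch:

model_profile(explain_reg,
              N = 500,
              variables = c("year_built", "total_floo")) %>%
  plot()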