Welcome and Schedule

6:30pm Introductions

  • Preferred Name & Pronouns
  • Recent book or something you learned

7:00pm Course Introduction

7:45pm Break + Debugging (15 minutes)

8:00pm Coding Review

Course Introduction

Course Motivation

What should the relationship between the public sector and ML/AI be? Think of a few words and put them in the chat.

A framework

Coding Philosophy

  • Reproducibility ** 2
  • Iterative
  • tidy
    • “the bottleneck in most data analysis is thinking time, not computing time.” ~tidyverse manifesto

Review Coding Warmup 0

Break

Coding Review/EDA

library(tidyverse)

file_loc <- '~/../Downloads/'

ccao <- read_csv(str_glue(file_loc, 
                          'Assessor__Archived_05-11-2022__-_Residential_Modeling_Characteristics__Chicago_.csv'))

str(ccao)
## spc_tbl_ [678,656 x 80] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Property Index Number                    : chr [1:678656] "17-09-244-018-1031" "19-01-102-023-0000" "14-32-227-010-0000" "20-06-401-024-0000" ...
##  $ Assessment Triad                         : chr [1:678656] "City" "City" "City" "City" ...
##  $ Property Class                           : num [1:678656] 299 203 205 211 203 212 212 205 299 211 ...
##  $ Property Address                         : chr [1:678656] NA "2952 W 39TH PL" "2039 N FREMONT ST" "4359 S WOLCOTT AVE" ...
##  $ Property Apartment Num.                  : chr [1:678656] NA NA NA NA ...
##  $ Property City                            : chr [1:678656] NA "CHICAGO" "CHICAGO" "CHICAGO" ...
##  $ Township Name                            : chr [1:678656] "North Chicago" "Lake" "North Chicago" "Lake" ...
##  $ Township Code                            : num [1:678656] 74 72 74 72 71 71 70 77 74 72 ...
##  $ Neighborhood Code                        : num [1:678656] 74030 72040 74012 72080 71430 ...
##  $ Township Neighborhood Only               : chr [1:678656] "030" "040" "012" "080" ...
##  $ Modeling Group                           : chr [1:678656] "NCHARS" "SF" "SF" "MF" ...
##  $ Proportion Assessed                      : num [1:678656] 0.00166 1 1 1 1 ...
##  $ Key PIN                                  : chr [1:678656] "17-09-244-018-1001" NA NA NA ...
##  $ Multi Code Indicator                     : num [1:678656] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Multi Code                               : num [1:678656] 2 2 NA 2 2 NA NA NA 2 2 ...
##  $ Number of Active 288s (HIEs)             : num [1:678656] NA 0 0 0 0 0 0 0 NA 0 ...
##  $ Number of Ended 288s (HIEs)              : num [1:678656] NA 0 0 0 0 0 0 0 NA 0 ...
##  $ Sale Date                                : chr [1:678656] NA NA NA NA ...
##  $ Sale Price                               : num [1:678656] NA NA NA NA NA NA NA NA NA NA ...
##  $ Sale Document Number                     : num [1:678656] NA NA NA NA NA NA NA NA NA NA ...
##  $ Land Square Feet                         : num [1:678656] NA 3100 6250 2812 5953 ...
##  $ Large Home Indicator                     : logi [1:678656] NA FALSE FALSE FALSE FALSE FALSE ...
##  $ Age                                      : num [1:678656] 19 96 132 126 65 69 97 77 17 100 ...
##  $ Apartments                               : chr [1:678656] NA NA NA "Three" ...
##  $ Wall Material                            : chr [1:678656] NA "Frame" NA "Frame" ...
##  $ Roof Material                            : chr [1:678656] NA "Shingle + Asphalt" NA "Shingle + Asphalt" ...
##  $ Rooms                                    : num [1:678656] NA 4 0 20 5 0 0 0 NA 7 ...
##  $ Bedrooms                                 : num [1:678656] NA 2 0 10 2 0 0 0 NA 5 ...
##  $ Basement                                 : chr [1:678656] NA "Slab" NA "Full" ...
##  $ Basement Finish                          : chr [1:678656] NA "Unfinished" NA "Apartment" ...
##  $ Central Heating                          : chr [1:678656] NA "Warm Air Furnace" NA "None" ...
##  $ Central Air Conditioning                 : chr [1:678656] NA "No Central A/C" NA "No Central A/C" ...
##  $ Fireplaces                               : num [1:678656] NA 0 NA 0 0 NA NA NA NA 0 ...
##  $ Attic Type                               : chr [1:678656] NA "None" NA "None" ...
##  $ Full Baths                               : num [1:678656] NA 1 0 5 1 0 0 0 NA 3 ...
##  $ Half Baths                               : num [1:678656] NA 0 0 0 0 0 0 0 NA 0 ...
##  $ Design Plan                              : chr [1:678656] NA NA NA "Stock Plan" ...
##  $ Cathedral Ceiling                        : chr [1:678656] NA NA NA NA ...
##  $ Garage 1 Size                            : chr [1:678656] NA "0 cars" NA "0 cars" ...
##  $ Garage 1 Material                        : chr [1:678656] NA NA NA NA ...
##  $ Garage 1 Attached                        : logi [1:678656] NA NA NA NA FALSE NA ...
##  $ Garage 1 Area                            : logi [1:678656] NA NA NA NA FALSE NA ...
##  $ Building Square Feet                     : num [1:678656] NA 1044 0 4320 1098 ...
##  $ Repair Condition                         : chr [1:678656] NA "Average" NA "Average" ...
##  $ Use                                      : chr [1:678656] NA "Single-Family" NA "Multi-Family" ...
##  $ Type of Residence                        : chr [1:678656] NA "1 Story" NA "2 Story" ...
##  $ Attic Finish                             : chr [1:678656] NA NA NA NA ...
##  $ Renovation                               : chr [1:678656] NA NA NA NA ...
##  $ Porch                                    : chr [1:678656] NA NA NA NA ...
##  $ Property Zip Code                        : chr [1:678656] NA "60632-1804" "60614-4363" "60609-3124" ...
##  $ Tract GEOID                              : num [1:678656] 1.7e+10 1.7e+10 1.7e+10 1.7e+10 1.7e+10 ...
##  $ Municipality FIPS Code                   : num [1:678656] 14000 14000 14000 14000 14000 14000 14000 14000 14000 14000 ...
##  $ Municipality Name                        : chr [1:678656] "Chicago" "Chicago" "Chicago" "Chicago" ...
##  $ O'Hare Noise Indicator                   : num [1:678656] 0 0 0 0 0 0 0 0 0 0 ...
##  $ FEMA Floodplain                          : num [1:678656] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Flood Risk Factor                        : num [1:678656] 4 6 1 5 1 3 1 3 1 1 ...
##  $ Flood Risk Direction                     : num [1:678656] 0 0 0 0 0 1 0 1 0 0 ...
##  $ Road Proximity < 100 Feet                : num [1:678656] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Road Proximity 101 - 300 Feet            : num [1:678656] 0 0 0 0 0 0 1 0 1 0 ...
##  $ Elementary/Middle School District        : chr [1:678656] "OGDEN" "DAVIS, N" "MAYER" "LARA" ...
##  $ High School District                     : chr [1:678656] "WELLS HS" "KELLY HS" "LINCOLN PARK HS" "RICHARDS HS" ...
##  $ Longitude                                : num [1:678656] -87.6 -87.7 -87.7 -87.7 -87.8 ...
##  $ Latitude                                 : num [1:678656] 41.9 41.8 41.9 41.8 42 ...
##  $ geoposition                              : chr [1:678656] "POINT (-87.6344132462695 41.8913692293796)" "POINT (-87.6992293705144 41.8220515067538)" "POINT (-87.6506234192346 41.9193306987512)" "POINT (-87.6720496981807 41.8141134023529)" ...
##  $ Tax Rate                                 : num [1:678656] 6.89 6.93 6.89 6.93 6.89 ...
##  $ Tract Median Income                      : num [1:678656] 128243 39706 119432 33507 75601 ...
##  $ Condo Model: Building PIN                : chr [1:678656] "17-09-244-018" NA NA NA ...
##  $ Condo Model: Modeled Unit Strata (of 10) : chr [1:678656] NA NA NA NA ...
##  $ Condo Model: Modeled Unit Strata (of 300): chr [1:678656] NA NA NA NA ...
##  $ Modeled Sale Year                        : num [1:678656] 2021 2021 2021 2021 2021 ...
##  $ Modeled Sale Month of Year               : chr [1:678656] "Jan" "Jan" "Jan" "Jan" ...
##  $ Modeled Sale Day of Year                 : num [1:678656] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Modeled Sale Week of Year                : num [1:678656] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Modeled Sale Quarter of Year             : chr [1:678656] "Q1" "Q1" "Q1" "Q1" ...
##  $ Modeled Sale Quarter                     : num [1:678656] 96 96 96 96 96 96 96 96 96 96 ...
##  $ Modeled Sale Month                       : num [1:678656] 288 288 288 288 288 288 288 288 288 288 ...
##  $ Modeled Sale Week                        : num [1:678656] 1252 1252 1252 1252 1252 ...
##  $ Modeled Sale Day                         : num [1:678656] 8766 8766 8766 8766 8766 ...
##  $ Modeled Sale During School Year          : logi [1:678656] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ Modeled Sale During Holidays             : logi [1:678656] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Property Index Number` = col_character(),
##   ..   `Assessment Triad` = col_character(),
##   ..   `Property Class` = col_double(),
##   ..   `Property Address` = col_character(),
##   ..   `Property Apartment Num.` = col_character(),
##   ..   `Property City` = col_character(),
##   ..   `Township Name` = col_character(),
##   ..   `Township Code` = col_double(),
##   ..   `Neighborhood Code` = col_double(),
##   ..   `Township Neighborhood Only` = col_character(),
##   ..   `Modeling Group` = col_character(),
##   ..   `Proportion Assessed` = col_double(),
##   ..   `Key PIN` = col_character(),
##   ..   `Multi Code Indicator` = col_double(),
##   ..   `Multi Code` = col_double(),
##   ..   `Number of Active 288s (HIEs)` = col_double(),
##   ..   `Number of Ended 288s (HIEs)` = col_double(),
##   ..   `Sale Date` = col_character(),
##   ..   `Sale Price` = col_double(),
##   ..   `Sale Document Number` = col_double(),
##   ..   `Land Square Feet` = col_double(),
##   ..   `Large Home Indicator` = col_logical(),
##   ..   Age = col_double(),
##   ..   Apartments = col_character(),
##   ..   `Wall Material` = col_character(),
##   ..   `Roof Material` = col_character(),
##   ..   Rooms = col_double(),
##   ..   Bedrooms = col_double(),
##   ..   Basement = col_character(),
##   ..   `Basement Finish` = col_character(),
##   ..   `Central Heating` = col_character(),
##   ..   `Central Air Conditioning` = col_character(),
##   ..   Fireplaces = col_double(),
##   ..   `Attic Type` = col_character(),
##   ..   `Full Baths` = col_double(),
##   ..   `Half Baths` = col_double(),
##   ..   `Design Plan` = col_character(),
##   ..   `Cathedral Ceiling` = col_character(),
##   ..   `Garage 1 Size` = col_character(),
##   ..   `Garage 1 Material` = col_character(),
##   ..   `Garage 1 Attached` = col_logical(),
##   ..   `Garage 1 Area` = col_logical(),
##   ..   `Building Square Feet` = col_double(),
##   ..   `Repair Condition` = col_character(),
##   ..   Use = col_character(),
##   ..   `Type of Residence` = col_character(),
##   ..   `Attic Finish` = col_character(),
##   ..   Renovation = col_character(),
##   ..   Porch = col_character(),
##   ..   `Property Zip Code` = col_character(),
##   ..   `Tract GEOID` = col_double(),
##   ..   `Municipality FIPS Code` = col_double(),
##   ..   `Municipality Name` = col_character(),
##   ..   `O'Hare Noise Indicator` = col_double(),
##   ..   `FEMA Floodplain` = col_double(),
##   ..   `Flood Risk Factor` = col_double(),
##   ..   `Flood Risk Direction` = col_double(),
##   ..   `Road Proximity < 100 Feet` = col_double(),
##   ..   `Road Proximity 101 - 300 Feet` = col_double(),
##   ..   `Elementary/Middle School District` = col_character(),
##   ..   `High School District` = col_character(),
##   ..   Longitude = col_double(),
##   ..   Latitude = col_double(),
##   ..   geoposition = col_character(),
##   ..   `Tax Rate` = col_double(),
##   ..   `Tract Median Income` = col_double(),
##   ..   `Condo Model: Building PIN` = col_character(),
##   ..   `Condo Model: Modeled Unit Strata (of 10)` = col_character(),
##   ..   `Condo Model: Modeled Unit Strata (of 300)` = col_character(),
##   ..   `Modeled Sale Year` = col_double(),
##   ..   `Modeled Sale Month of Year` = col_character(),
##   ..   `Modeled Sale Day of Year` = col_double(),
##   ..   `Modeled Sale Week of Year` = col_double(),
##   ..   `Modeled Sale Quarter of Year` = col_character(),
##   ..   `Modeled Sale Quarter` = col_double(),
##   ..   `Modeled Sale Month` = col_double(),
##   ..   `Modeled Sale Week` = col_double(),
##   ..   `Modeled Sale Day` = col_double(),
##   ..   `Modeled Sale During School Year` = col_logical(),
##   ..   `Modeled Sale During Holidays` = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>
ccao %>% glimpse()
## Rows: 678,656
## Columns: 80
## $ `Property Index Number`                     <chr> "17-09-244-018-1031", "19-~
## $ `Assessment Triad`                          <chr> "City", "City", "City", "C~
## $ `Property Class`                            <dbl> 299, 203, 205, 211, 203, 2~
## $ `Property Address`                          <chr> NA, "2952 W 39TH PL", "203~
## $ `Property Apartment Num.`                   <chr> NA, NA, NA, NA, NA, NA, NA~
## $ `Property City`                             <chr> NA, "CHICAGO", "CHICAGO", ~
## $ `Township Name`                             <chr> "North Chicago", "Lake", "~
## $ `Township Code`                             <dbl> 74, 72, 74, 72, 71, 71, 70~
## $ `Neighborhood Code`                         <dbl> 74030, 72040, 74012, 72080~
## $ `Township Neighborhood Only`                <chr> "030", "040", "012", "080"~
## $ `Modeling Group`                            <chr> "NCHARS", "SF", "SF", "MF"~
## $ `Proportion Assessed`                       <dbl> 0.0016613, 1.0000000, 1.00~
## $ `Key PIN`                                   <chr> "17-09-244-018-1001", NA, ~
## $ `Multi Code Indicator`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Multi Code`                                <dbl> 2, 2, NA, 2, 2, NA, NA, NA~
## $ `Number of Active 288s (HIEs)`              <dbl> NA, 0, 0, 0, 0, 0, 0, 0, N~
## $ `Number of Ended 288s (HIEs)`               <dbl> NA, 0, 0, 0, 0, 0, 0, 0, N~
## $ `Sale Date`                                 <chr> NA, NA, NA, NA, NA, NA, NA~
## $ `Sale Price`                                <dbl> NA, NA, NA, NA, NA, NA, NA~
## $ `Sale Document Number`                      <dbl> NA, NA, NA, NA, NA, NA, NA~
## $ `Land Square Feet`                          <dbl> NA, 3100, 6250, 2812, 5953~
## $ `Large Home Indicator`                      <lgl> NA, FALSE, FALSE, FALSE, F~
## $ Age                                         <dbl> 19, 96, 132, 126, 65, 69, ~
## $ Apartments                                  <chr> NA, NA, NA, "Three", NA, N~
## $ `Wall Material`                             <chr> NA, "Frame", NA, "Frame", ~
## $ `Roof Material`                             <chr> NA, "Shingle + Asphalt", N~
## $ Rooms                                       <dbl> NA, 4, 0, 20, 5, 0, 0, 0, ~
## $ Bedrooms                                    <dbl> NA, 2, 0, 10, 2, 0, 0, 0, ~
## $ Basement                                    <chr> NA, "Slab", NA, "Full", "F~
## $ `Basement Finish`                           <chr> NA, "Unfinished", NA, "Apa~
## $ `Central Heating`                           <chr> NA, "Warm Air Furnace", NA~
## $ `Central Air Conditioning`                  <chr> NA, "No Central A/C", NA, ~
## $ Fireplaces                                  <dbl> NA, 0, NA, 0, 0, NA, NA, N~
## $ `Attic Type`                                <chr> NA, "None", NA, "None", "N~
## $ `Full Baths`                                <dbl> NA, 1, 0, 5, 1, 0, 0, 0, N~
## $ `Half Baths`                                <dbl> NA, 0, 0, 0, 0, 0, 0, 0, N~
## $ `Design Plan`                               <chr> NA, NA, NA, "Stock Plan", ~
## $ `Cathedral Ceiling`                         <chr> NA, NA, NA, NA, "No", NA, ~
## $ `Garage 1 Size`                             <chr> NA, "0 cars", NA, "0 cars"~
## $ `Garage 1 Material`                         <chr> NA, NA, NA, NA, "Frame", N~
## $ `Garage 1 Attached`                         <lgl> NA, NA, NA, NA, FALSE, NA,~
## $ `Garage 1 Area`                             <lgl> NA, NA, NA, NA, FALSE, NA,~
## $ `Building Square Feet`                      <dbl> NA, 1044, 0, 4320, 1098, 0~
## $ `Repair Condition`                          <chr> NA, "Average", NA, "Averag~
## $ Use                                         <chr> NA, "Single-Family", NA, "~
## $ `Type of Residence`                         <chr> NA, "1 Story", NA, "2 Stor~
## $ `Attic Finish`                              <chr> NA, NA, NA, NA, NA, NA, NA~
## $ Renovation                                  <chr> NA, NA, NA, NA, NA, NA, NA~
## $ Porch                                       <chr> NA, NA, NA, NA, NA, NA, NA~
## $ `Property Zip Code`                         <chr> NA, "60632-1804", "60614-4~
## $ `Tract GEOID`                               <dbl> 17031081800, 17031580501, ~
## $ `Municipality FIPS Code`                    <dbl> 14000, 14000, 14000, 14000~
## $ `Municipality Name`                         <chr> "Chicago", "Chicago", "Chi~
## $ `O'Hare Noise Indicator`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `FEMA Floodplain`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Flood Risk Factor`                         <dbl> 4, 6, 1, 5, 1, 3, 1, 3, 1,~
## $ `Flood Risk Direction`                      <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0,~
## $ `Road Proximity < 100 Feet`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Road Proximity 101 - 300 Feet`             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1,~
## $ `Elementary/Middle School District`         <chr> "OGDEN", "DAVIS, N", "MAYE~
## $ `High School District`                      <chr> "WELLS HS", "KELLY HS", "L~
## $ Longitude                                   <dbl> -87.63441, -87.69923, -87.~
## $ Latitude                                    <dbl> 41.89137, 41.82205, 41.919~
## $ geoposition                                 <chr> "POINT (-87.6344132462695 ~
## $ `Tax Rate`                                  <dbl> 6.890000, 6.934000, 6.8900~
## $ `Tract Median Income`                       <dbl> 128243, 39706, 119432, 335~
## $ `Condo Model: Building PIN`                 <chr> "17-09-244-018", NA, NA, N~
## $ `Condo Model: Modeled Unit Strata (of 10)`  <chr> NA, NA, NA, NA, NA, NA, NA~
## $ `Condo Model: Modeled Unit Strata (of 300)` <chr> NA, NA, NA, NA, NA, NA, NA~
## $ `Modeled Sale Year`                         <dbl> 2021, 2021, 2021, 2021, 20~
## $ `Modeled Sale Month of Year`                <chr> "Jan", "Jan", "Jan", "Jan"~
## $ `Modeled Sale Day of Year`                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ `Modeled Sale Week of Year`                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ `Modeled Sale Quarter of Year`              <chr> "Q1", "Q1", "Q1", "Q1", "Q~
## $ `Modeled Sale Quarter`                      <dbl> 96, 96, 96, 96, 96, 96, 96~
## $ `Modeled Sale Month`                        <dbl> 288, 288, 288, 288, 288, 2~
## $ `Modeled Sale Week`                         <dbl> 1252, 1252, 1252, 1252, 12~
## $ `Modeled Sale Day`                          <dbl> 8766, 8766, 8766, 8766, 87~
## $ `Modeled Sale During School Year`           <lgl> TRUE, TRUE, TRUE, TRUE, TR~
## $ `Modeled Sale During Holidays`              <lgl> TRUE, TRUE, TRUE, TRUE, TR~
    1. Explore/describe
    1. Count property classes
    1. Top Five Classes
# count property classes
ccao %>% count(`Property Class`)
ccao %>% group_by(`Property Class`) %>%
  summarize(n = n())
ccao %>% count(`Property Class`, sort=TRUE)
count(ccao, `Property Class`)
# top five classes

ccao %>% count(`Property Class`) %>%
  slice_max(order_by = `n`, n=5)
    1. Keep only townships with Chicago in the name
    1. Average rooms by property class
# keep only townships with Chicago in the name

ccao %>% distinct(`Township Name`)
ccao %>% filter(str_detect(`Township Name`, '(?<=g)o')) %>%
  count(`Township Name`)
# average rooms by property class

ccao %>% group_by(`Property Class`) %>%
  summarize(avg_rooms = mean(Rooms),
            count = n()) %>% view()
    1. understand dates with lubridate
    1. count sales in January 2019
    1. count sales by month
# lubridate

library(lubridate)

ccao %>% select(`Sale Date`)
ccao %>% select(contains('Date'))
ccao %>% count(`Sale Date`, sort=TRUE)
ccao <- ccao %>%
  mutate(
    `Sale Date` = mdy_hms(`Sale Date`)
  )

ccao %>% count(`Sale Date`)
# count sales in January 2019

ccao %>% filter(
  month(`Sale Date`) == 1 & year(`Sale Date`) == 2019
)
ccao %>% mutate(
  new_date = floor_date(`Sale Date`, 'month')
) %>% select(new_date) %>% filter(
  !is.na(new_date),
  new_date == ymd_hms('2019-01-01 00:00:00'))
# count sales by month

ccao %>% group_by(
  month = floor_date(`Sale Date`, 'month')
) %>% summarize(
  count = n()
)
    1. ggplot2
    1. sales by week plot
    1. sales price density
# ggplot2
library(ggplot2)

theme_set(theme_bw())

# sales by week

gdata <- ccao %>% 
  filter(!is.na(`Sale Date`)) %>%
  group_by(week = floor_date(`Sale Date`, 'week')) %>%
  summarize(count = n())

ggplot(gdata, aes(x=week, y=count)) +
  geom_point() +
  geom_line()

ggplot(gdata, aes(x=week, y=count)) +
  geom_point(size=2) + geom_line(linewidth=1.5) +
  scale_x_datetime(date_breaks = '6 months', date_labels='%m/%d/%y') +
  labs(x='Week of', y='Sales', title='Sales in Chicago by Week')

# sale price density

library(scales)

ggplot(ccao, aes(x=`Sale Price`)) +
  geom_density(fill='navy')

ggplot(ccao, aes(x=`Sale Price`)) +
  geom_density(fill='navy') +
  scale_x_log10(labels=dollar_format())

ggplot(ccao, aes(x=`Township Name`, y=`Sale Price`)) +
  geom_boxplot(fill='navy') +
  scale_y_log10(labels=dollar_format()) +
  coord_flip()