Example 05: puurr and problem set 4

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/socdata.co
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(readxl)
options(dplyr.summarise.inform = FALSE)
library(conflicted)

conflicts_prefer(
  dplyr::filter,
  dplyr::lag
)
[conflicted] Will prefer dplyr::filter over any other package.
[conflicted] Will prefer dplyr::lag over any other package.

AK1.xlsx

Code
df_ak <- read_xlsx(here("projects", "04-problem-set", "data", "AK1.xlsx"), skip = 2) |> 
  janitor::clean_names() |> 
  rename(age = x1)
New names:
• `` -> `...1`
Code
df_ak
# A tibble: 102 × 7
   age          qx      lx     dx   lx_2       tx    ex
   <chr>     <dbl>   <dbl>  <dbl>  <dbl>    <dbl> <dbl>
 1 0–1   0.00513   100000  513.   99623. 7661992   76.6
 2 1–2   0.000308   99487.  30.6  99472. 7562368.  76.0
 3 2–3   0.0000992  99457.   9.87 99452. 7462896.  75.0
 4 3–4   0.000194   99447.  19.3  99437. 7363444.  74.0
 5 4–5   0.000192   99428.  19.1  99418. 7264007   73.1
 6 5–6   0.000288   99408.  28.6  99394. 7164589   72.1
 7 6–7   0.000328   99380.  32.6  99364. 7065195   71.1
 8 7–8   0.000348   99347.  34.6  99330. 6965832.  70.1
 9 8–9   0.000349   99313.  34.6  99295. 6866502.  69.1
10 9–10  0.000333   99278.  33.1  99261. 6767206   68.2
# ℹ 92 more rows
Code
headers <- read_xlsx(here("projects", "04-problem-set", "data", "AK1.xlsx"), skip = 1, n_max = 1) |> 
  colnames() |> 
  janitor::make_clean_names()
  

df_ak <- read_xlsx(here("projects", "04-problem-set", "data", "AK1.xlsx"), skip = 2) |> 
  janitor::clean_names() |> 
  rename(age = x1) 
New names:
• `` -> `...1`
Code
colnames(df_ak) <- headers

df_ak
# A tibble: 102 × 7
   age_years probability_of_dying_between_ages_x_and_x_1 number_surviving_to_a…¹
   <chr>                                           <dbl>                   <dbl>
 1 0–1                                         0.00513                   100000 
 2 1–2                                         0.000308                   99487.
 3 2–3                                         0.0000992                  99457.
 4 3–4                                         0.000194                   99447.
 5 4–5                                         0.000192                   99428.
 6 5–6                                         0.000288                   99408.
 7 6–7                                         0.000328                   99380.
 8 7–8                                         0.000348                   99347.
 9 8–9                                         0.000349                   99313.
10 9–10                                        0.000333                   99278.
# ℹ 92 more rows
# ℹ abbreviated name: ¹​number_surviving_to_age_x
# ℹ 4 more variables: number_dying_between_ages_x_and_x_1 <dbl>,
#   person_years_lived_between_ages_x_and_x_1 <dbl>,
#   total_number_of_person_years_lived_over_age_x <dbl>,
#   expectation_of_life_at_age_x <dbl>

countries.csv

Code
df_cc <- read_delim(here("projects", "04-problem-set", "data", "countries.csv")) |> 
  pivot_longer(cols = VarA2017:VarB2021, 
               names_to = c("var", "year"), 
               names_pattern = "Var([A|B])(\\d{4})") |> 
  janitor::clean_names()
Rows: 3 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country.Name
dbl (9): VarA2017, VarA2018, VarA2019, VarA2020, VarA2021, VarB2017, VarB201...
lgl (1): VarB2021

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
df_cc |> 
  print(n = Inf)
# A tibble: 30 × 4
   country_name   var   year     value
   <chr>          <chr> <chr>    <dbl>
 1 United Kingdom A     2017   2.70e12
 2 United Kingdom A     2018   2.90e12
 3 United Kingdom A     2019   2.88e12
 4 United Kingdom A     2020   2.76e12
 5 United Kingdom A     2021   3.19e12
 6 United Kingdom B     2017   3.51e 1
 7 United Kingdom B     2018  NA      
 8 United Kingdom B     2019  NA      
 9 United Kingdom B     2020  NA      
10 United Kingdom B     2021  NA      
11 United States  A     2017   1.95e13
12 United States  A     2018   2.05e13
13 United States  A     2019   2.14e13
14 United States  A     2020   2.09e13
15 United States  A     2021   2.30e13
16 United States  B     2017   4.12e 1
17 United States  B     2018   4.14e 1
18 United States  B     2019   4.15e 1
19 United States  B     2020  NA      
20 United States  B     2021  NA      
21 Mexico         A     2017   1.16e12
22 Mexico         A     2018   1.22e12
23 Mexico         A     2019   1.27e12
24 Mexico         A     2020   1.09e12
25 Mexico         A     2021   1.29e12
26 Mexico         B     2017  NA      
27 Mexico         B     2018   4.67e 1
28 Mexico         B     2019  NA      
29 Mexico         B     2020   4.54e 1
30 Mexico         B     2021  NA      

street_tree_planting.csv

Code
plant_date_fmt <- "%m/%d/%Y"
df_tree <-
  read_csv(
    here("projects", "04-problem-set", "data", "street_tree_planting.csv"),
    skip = 7,
    col_types = cols(
      lng = col_double(),
      lat = col_double(),
      Borough = col_character(),
      ZipCode = col_character(),
      BuildingNumber = col_character(),
      StreetName = col_character(),
      FiscalYear = col_integer(),
      PlantingSpaceID = col_character(),
      CommunityBoard = col_character(),
      PlantingSeason = col_date(format = plant_date_fmt),
      CityCouncil = col_character(),
      TreeID = col_character(),
      WOId = col_character(),
      WOStatus = col_character(),
      CompletedDate = col_datetime())) |>
      janitor::clean_names() 

df_tree
# A tibble: 9,295 × 15
     lng   lat borough   zip_code building_number street_name       fiscal_year
   <dbl> <dbl> <chr>     <chr>    <chr>           <chr>                   <int>
 1 -73.8  40.7 Queens    11436    147-08          116 AVENUE                  0
 2 -73.9  40.9 Manhattan 10033    29              CHITTENDEN AVENUE           0
 3 -73.9  40.8 Bronx     10472    1250            WARD AVENUE                 0
 4 -74.0  40.6 Brooklyn  11209    313             78 STREET                   0
 5 -74.0  40.6 Brooklyn  11219    6502            10 AVENUE                   0
 6 -74.0  40.6 Brooklyn  11219    1069            54 STREET                   0
 7 -74.0  40.6 Brooklyn  11219    1515            57 STREET                   0
 8 -73.9  40.6 Brooklyn  11210    1885            FLATBUSH AVENUE             0
 9 -74.0  40.7 Manhattan 10011    350             WEST 18 STREET              0
10 -73.9  40.8 Queens    11101    24-02           40 AVENUE                   0
# ℹ 9,285 more rows
# ℹ 8 more variables: planting_space_id <chr>, community_board <chr>,
#   planting_season <date>, city_council <chr>, tree_id <chr>, wo_id <chr>,
#   wo_status <chr>, completed_date <dttm>

table004.xlsx

Trying to do it in a “row-agnostic” manner (i.e. I don’t want to index any rows by their row number).

Code
df_tab4 <- read_xlsx(here("projects", "04-problem-set", "data", "table004.xlsx"), 
                     skip = 3,  
                     col_names = FALSE) |> 
  # ugh, have to search for literal double backslashes
  mutate(across(everything(), \(x) str_remove_all(x, "(\\\\)(\\d{1},?)"))) |> 
  # Tidy agegroups
  mutate(agegroup = !str_detect(...1, "At birth"),
         agegroup = ifelse(agegroup, NA, "At birth"),
         agegroup = ifelse(!str_detect(...1, "At 65 years"), agegroup, "At 65 years"),
         agegroup = ifelse(!str_detect(...1, "At 75 years"), agegroup, "At 75 years"),
         .before = everything()) |>
  fill(agegroup) |>
  # Cleanup
  filter(...2 != "Life expectancy (years)") |> 
  mutate(...1 = na_if(...1, "Specified age and year"),
         agegroup = ifelse(!str_detect(...1, "At 65 years"), agegroup, "At 65 years"),
         agegroup = ifelse(!str_detect(...1, "At 75 years"), agegroup, "At 75 years"),
         agegroup = ifelse(is.na(...1), "At birth", agegroup),
         .before = everything()) |> 
  fill(...1, .direction = "up") |> 
  # Get the race/ethnicity group names across the columns by repeatedly
  # pivoting and filling. A bit ugly.
  pivot_longer(...2:...4) |> 
  fill(value) |>
  pivot_wider(names_from = name, values_from = value) |> 
  pivot_longer(...5:...7) |> 
  fill(value) |> 
  pivot_wider(names_from = name, values_from = value) |> 
  pivot_longer(...8:...10) |> 
  fill(value) |> 
  pivot_wider(names_from = name, values_from = value) |> 
  # NOW replace the "- - -" NAs, but not before 
  mutate(
    ## NAify dashes and ellipsis 
    across(everything(), \(x) na_if(x, "- - -")),
    across(everything(), \(x) na_if(x, "…")),
    ## Merge the rownames into racevar:sexvar by grabbing from the previous
    ## row if and only if the cell doesn't begin with a digit AND its lag(1)
    ## doesn't begin with a digit
    across(...1:...10, \(x) if_else(!str_detect(x, "^\\d") & !str_detect(lag(x), "^\\d"), 
                                         paste0(lag(x),":", x), x)), 
    across(everything(), \(x) str_replace(x, "–", ""))) |> 
  # Drop the first row
  filter(!if_all(...2:...10, \(x) is.na(x))) |> 
  # Drop the superfluous middle row
  filter(str_detect(...2, "^\\d") | str_detect(...2, ":")) |> 
  # Create an indicator for the subtables
  mutate(subtable = !str_detect(...2, "\\d"), 
         .before = everything()) |> 
  mutate(subtable_n = seq(n()), .by = subtable, 
         subtable_n = if_else(!subtable, NA, subtable_n), 
         .before = everything()) |> 
  fill(subtable_n) |> 
  select(-subtable) |> 
  # The subtables will have different colnames so we nest them, add those
  # colnames (extracting them from the first rows of each subtable), and 
  # then drop the now-useless first row and pivot_longer to harmonize the
  # columns so we can unnest without loss or confusion
  nest(.by = subtable_n) |> 
  mutate(vnames = map(data, \(x) c("agegroup", "year", x[1,3:ncol(x)])),
         data = map2(data, vnames, \(x,y) set_names(x,y)), 
         # Drop the first row; superfluous now
         data = map(data, \(x) slice_tail(x, n = -1)),
         data = map(data, \(x) pivot_longer(x, cols = !all_of(c("agegroup", 
                                                                "year"))))) |> 
  select(-vnames) |> 
  unnest(data) |> 
  select(-subtable_n) |> 
  # Finally, split the name column and clean up
  separate(name, into = c("race", "sex"), sep = ":") |> 
  mutate(value = str_squish(value), 
         value = na_if(value, "NA"), 
         value = as.numeric(value))
New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
Code
## Check for coercion
# which(is.na(as.numeric(df_tab4_tidy$value)) != is.na(df_tab4_tidy$value))


df_tab4 |> 
  slice_sample(n=20) |> 
  print(n=20)
# A tibble: 20 × 5
   agegroup    year  race                      sex        value
   <chr>       <chr> <chr>                     <chr>      <dbl>
 1 At 75 years 2001  All races                 Male         9.9
 2 At birth    1950  Black or African American Both sexes  60.8
 3 At 75 years 1987  All races                 Both sexes  10.7
 4 At birth    1950  All races                 Both sexes  68.2
 5 At 75 years 2012  All races                 Both sexes  12.2
 6 At 65 years 1993  White                     Both sexes  17.4
 7 At birth    1996  White                     Female      79.7
 8 At 65 years 1983  White                     Both sexes  16.8
 9 At 65 years 2010  White, not Hispanic       Female      20.3
10 At 65 years 2011  Black or African American Male        16.2
11 At birth    2016  Black or African American Male        72  
12 At 65 years 2005  Black or African American Male        15  
13 At birth    1986  All races                 Both sexes  74.7
14 At 65 years 1984  White                     Both sexes  16.8
15 At birth    1992  All races                 Male        72.3
16 At 75 years 2007  Hispanic                  Both sexes  13.1
17 At birth    1982  Black or African American Both sexes  69.4
18 At 65 years 1983  All races                 Female      18.6
19 At 75 years 1981  All races                 Male         9  
20 At birth    2006  Hispanic                  Both sexes  80.3

table004.xlsx, an elegant solution from Andrés

This approach splits the difference between single pipeline and being “row-agnostic”. Note the use of readr::parse_number()

Code
age_categories <- c("At birth", "At 65 years", "At 75 years")

race_categories <- c(
  "All races",
  "White",
  "Black or African American",
  "White, not Hispanic",
  "Black, not Hispanic",
  "Hispanic"
)
sex_categories <- c("Both sexes",   "Male", "Female")
race_rgx <- paste(race_categories, collapse = "|")
age_rgx <- paste(age_categories, collapse = "|")

# Raw Data ----------------------------------------------------------------

raw <- readxl::read_xlsx(here("projects", "04-problem-set", "data", "table004.xlsx"), col_names = FALSE)
New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
Code
row_end <- detect_index(raw[[1]], \(x) grepl("- - -", x))
raw <- raw[1:(row_end - 1), ]

# Life + Race + Sex -------------------------------------------------------

index <- split(2:ncol(raw), rep(1:3, each = 3))

df_life_exp <- map(index, function(i) {

  temp <- raw[, i] |>
    rowid_to_column() |>
    set_names(c("rowid", sex_categories))

  temp$race <- str_extract(temp[[2]], race_rgx)

  temp <- fill(temp, race)

  temp |>
    mutate(across(all_of(sex_categories), readr::parse_number)) |>
    drop_na() |>
    pivot_longer(all_of(sex_categories), names_to = "sex", values_to = "life_exp")

}) |> bind_rows()
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(all_of(sex_categories), readr::parse_number)`.
Caused by warning:
! 5 parsing failures.
row col expected                  actual
  4  -- a number All races              
  5  -- a number Both sexes             
  6  -- a number Life expectancy (years)
136  -- a number Both sexes             
137  -- a number Life expectancy (years)
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(all_of(sex_categories), readr::parse_number)`.
Caused by warning:
! 5 parsing failures.
row col expected     actual
  5  -- a number Both sexes
 50  -- a number - - -     
 94  -- a number - - -     
134  -- a number - - -     
136  -- a number Both sexes
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(all_of(sex_categories), readr::parse_number)`.
Caused by warning:
! 8 parsing failures.
row col expected     actual
  5  -- a number Both sexes
 50  -- a number - - -     
 94  -- a number - - -     
134  -- a number - - -     
136  -- a number Both sexes
... ... ........ ..........
See problems(...) for more details.
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Code
# Year + Age --------------------------------------------------------------

df_yr <- raw[1] |>
  set_names("year") |>
  mutate(age = str_extract(year, age_rgx)) |>
  fill(age) |>
  rowid_to_column() |>
  mutate(year = readr::parse_number(year)) |>
  mutate(year = ifelse(str_detect(year, "^\\d$"), NA, year))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `year = readr::parse_number(year)`.
Caused by warning:
! 6 parsing failures.
row col expected                                                                                                                           actual
  2  -- a number Excel version (with more data years and standard errors when available): https://www.cdc.gov/nchs/hus/contents2019.htm#Table-004
  3  -- a number [Data are based on death certificates]                                                                                          
  4  -- a number Specified age and year                                                                                                          
  6  -- a number At birth                                                                                                                        
135  -- a number Specified age and year                                                                                                          
... ... ........ ................................................................................................................................
See problems(...) for more details.
Code
df_yr$single_race <- as.integer(str_detect(raw[[1]], "single race"))

df_complete <- df_yr |>
  drop_na() |>
  right_join(df_life_exp)
Joining with `by = join_by(rowid)`
Code
df_complete
# A tibble: 1,485 × 7
   rowid  year age      single_race race                      sex       life_exp
   <int> <dbl> <chr>          <int> <chr>                     <chr>        <dbl>
 1     7  1900 At birth           0 All races                 Both sex…     47.3
 2     7  1900 At birth           0 All races                 Male          46.3
 3     7  1900 At birth           0 All races                 Female        48.3
 4     7  1900 At birth           0 White                     Both sex…     47.6
 5     7  1900 At birth           0 White                     Male          46.6
 6     7  1900 At birth           0 White                     Female        48.7
 7     7  1900 At birth           0 Black or African American Both sex…     33  
 8     7  1900 At birth           0 Black or African American Male          32.5
 9     7  1900 At birth           0 Black or African American Female        33.5
10     8  1950 At birth           0 All races                 Both sex…     68.2
# ℹ 1,475 more rows