── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Rows: 3 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country.Name
dbl (9): VarA2017, VarA2018, VarA2019, VarA2020, VarA2021, VarB2017, VarB201...
lgl (1): VarB2021
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
df_cc |>print(n =Inf)
# A tibble: 30 × 4
country_name var year value
<chr> <chr> <chr> <dbl>
1 United Kingdom A 2017 2.70e12
2 United Kingdom A 2018 2.90e12
3 United Kingdom A 2019 2.88e12
4 United Kingdom A 2020 2.76e12
5 United Kingdom A 2021 3.19e12
6 United Kingdom B 2017 3.51e 1
7 United Kingdom B 2018 NA
8 United Kingdom B 2019 NA
9 United Kingdom B 2020 NA
10 United Kingdom B 2021 NA
11 United States A 2017 1.95e13
12 United States A 2018 2.05e13
13 United States A 2019 2.14e13
14 United States A 2020 2.09e13
15 United States A 2021 2.30e13
16 United States B 2017 4.12e 1
17 United States B 2018 4.14e 1
18 United States B 2019 4.15e 1
19 United States B 2020 NA
20 United States B 2021 NA
21 Mexico A 2017 1.16e12
22 Mexico A 2018 1.22e12
23 Mexico A 2019 1.27e12
24 Mexico A 2020 1.09e12
25 Mexico A 2021 1.29e12
26 Mexico B 2017 NA
27 Mexico B 2018 4.67e 1
28 Mexico B 2019 NA
29 Mexico B 2020 4.54e 1
30 Mexico B 2021 NA
Trying to do it in a “row-agnostic” manner (i.e. I don’t want to index any rows by their row number).
Code
df_tab4 <-read_xlsx(here("projects", "04-problem-set", "data", "table004.xlsx"), skip =3, col_names =FALSE) |># ugh, have to search for literal double backslashesmutate(across(everything(), \(x) str_remove_all(x, "(\\\\)(\\d{1},?)"))) |># Tidy agegroupsmutate(agegroup =!str_detect(...1, "At birth"),agegroup =ifelse(agegroup, NA, "At birth"),agegroup =ifelse(!str_detect(...1, "At 65 years"), agegroup, "At 65 years"),agegroup =ifelse(!str_detect(...1, "At 75 years"), agegroup, "At 75 years"),.before =everything()) |>fill(agegroup) |># Cleanupfilter(...2!="Life expectancy (years)") |>mutate(...1 =na_if(...1, "Specified age and year"),agegroup =ifelse(!str_detect(...1, "At 65 years"), agegroup, "At 65 years"),agegroup =ifelse(!str_detect(...1, "At 75 years"), agegroup, "At 75 years"),agegroup =ifelse(is.na(...1), "At birth", agegroup),.before =everything()) |>fill(...1, .direction ="up") |># Get the race/ethnicity group names across the columns by repeatedly# pivoting and filling. A bit ugly.pivot_longer(...2:...4) |>fill(value) |>pivot_wider(names_from = name, values_from = value) |>pivot_longer(...5:...7) |>fill(value) |>pivot_wider(names_from = name, values_from = value) |>pivot_longer(...8:...10) |>fill(value) |>pivot_wider(names_from = name, values_from = value) |># NOW replace the "- - -" NAs, but not before mutate(## NAify dashes and ellipsis across(everything(), \(x) na_if(x, "- - -")),across(everything(), \(x) na_if(x, "…")),## Merge the rownames into racevar:sexvar by grabbing from the previous## row if and only if the cell doesn't begin with a digit AND its lag(1)## doesn't begin with a digitacross(...1:...10, \(x) if_else(!str_detect(x, "^\\d") &!str_detect(lag(x), "^\\d"), paste0(lag(x),":", x), x)), across(everything(), \(x) str_replace(x, "–", ""))) |># Drop the first rowfilter(!if_all(...2:...10, \(x) is.na(x))) |># Drop the superfluous middle rowfilter(str_detect(...2, "^\\d") |str_detect(...2, ":")) |># Create an indicator for the subtablesmutate(subtable =!str_detect(...2, "\\d"), .before =everything()) |>mutate(subtable_n =seq(n()), .by = subtable, subtable_n =if_else(!subtable, NA, subtable_n), .before =everything()) |>fill(subtable_n) |>select(-subtable) |># The subtables will have different colnames so we nest them, add those# colnames (extracting them from the first rows of each subtable), and # then drop the now-useless first row and pivot_longer to harmonize the# columns so we can unnest without loss or confusionnest(.by = subtable_n) |>mutate(vnames =map(data, \(x) c("agegroup", "year", x[1,3:ncol(x)])),data =map2(data, vnames, \(x,y) set_names(x,y)), # Drop the first row; superfluous nowdata =map(data, \(x) slice_tail(x, n =-1)),data =map(data, \(x) pivot_longer(x, cols =!all_of(c("agegroup", "year"))))) |>select(-vnames) |>unnest(data) |>select(-subtable_n) |># Finally, split the name column and clean upseparate(name, into =c("race", "sex"), sep =":") |>mutate(value =str_squish(value), value =na_if(value, "NA"), value =as.numeric(value))
## Check for coercion# which(is.na(as.numeric(df_tab4_tidy$value)) != is.na(df_tab4_tidy$value))df_tab4 |>slice_sample(n=20) |>print(n=20)
# A tibble: 20 × 5
agegroup year race sex value
<chr> <chr> <chr> <chr> <dbl>
1 At 75 years 2001 All races Male 9.9
2 At birth 1950 Black or African American Both sexes 60.8
3 At 75 years 1987 All races Both sexes 10.7
4 At birth 1950 All races Both sexes 68.2
5 At 75 years 2012 All races Both sexes 12.2
6 At 65 years 1993 White Both sexes 17.4
7 At birth 1996 White Female 79.7
8 At 65 years 1983 White Both sexes 16.8
9 At 65 years 2010 White, not Hispanic Female 20.3
10 At 65 years 2011 Black or African American Male 16.2
11 At birth 2016 Black or African American Male 72
12 At 65 years 2005 Black or African American Male 15
13 At birth 1986 All races Both sexes 74.7
14 At 65 years 1984 White Both sexes 16.8
15 At birth 1992 All races Male 72.3
16 At 75 years 2007 Hispanic Both sexes 13.1
17 At birth 1982 Black or African American Both sexes 69.4
18 At 65 years 1983 All races Female 18.6
19 At 75 years 1981 All races Male 9
20 At birth 2006 Hispanic Both sexes 80.3
table004.xlsx, an elegant solution from Andrés
This approach splits the difference between single pipeline and being “row-agnostic”. Note the use of readr::parse_number()
Code
age_categories <-c("At birth", "At 65 years", "At 75 years")race_categories <-c("All races","White","Black or African American","White, not Hispanic","Black, not Hispanic","Hispanic")sex_categories <-c("Both sexes", "Male", "Female")race_rgx <-paste(race_categories, collapse ="|")age_rgx <-paste(age_categories, collapse ="|")# Raw Data ----------------------------------------------------------------raw <- readxl::read_xlsx(here("projects", "04-problem-set", "data", "table004.xlsx"), col_names =FALSE)
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(all_of(sex_categories), readr::parse_number)`.
Caused by warning:
! 5 parsing failures.
row col expected actual
4 -- a number All races
5 -- a number Both sexes
6 -- a number Life expectancy (years)
136 -- a number Both sexes
137 -- a number Life expectancy (years)
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(all_of(sex_categories), readr::parse_number)`.
Caused by warning:
! 5 parsing failures.
row col expected actual
5 -- a number Both sexes
50 -- a number - - -
94 -- a number - - -
134 -- a number - - -
136 -- a number Both sexes
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(all_of(sex_categories), readr::parse_number)`.
Caused by warning:
! 8 parsing failures.
row col expected actual
5 -- a number Both sexes
50 -- a number - - -
94 -- a number - - -
134 -- a number - - -
136 -- a number Both sexes
... ... ........ ..........
See problems(...) for more details.
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `year = readr::parse_number(year)`.
Caused by warning:
! 6 parsing failures.
row col expected actual
2 -- a number Excel version (with more data years and standard errors when available): https://www.cdc.gov/nchs/hus/contents2019.htm#Table-004
3 -- a number [Data are based on death certificates]
4 -- a number Specified age and year
6 -- a number At birth
135 -- a number Specified age and year
... ... ........ ................................................................................................................................
See problems(...) for more details.
# A tibble: 1,485 × 7
rowid year age single_race race sex life_exp
<int> <dbl> <chr> <int> <chr> <chr> <dbl>
1 7 1900 At birth 0 All races Both sex… 47.3
2 7 1900 At birth 0 All races Male 46.3
3 7 1900 At birth 0 All races Female 48.3
4 7 1900 At birth 0 White Both sex… 47.6
5 7 1900 At birth 0 White Male 46.6
6 7 1900 At birth 0 White Female 48.7
7 7 1900 At birth 0 Black or African American Both sex… 33
8 7 1900 At birth 0 Black or African American Male 32.5
9 7 1900 At birth 0 Black or African American Female 33.5
10 8 1950 At birth 0 All races Both sex… 68.2
# ℹ 1,475 more rows
Source Code
---title: "Example 05: puurr and problem set 4"---## Setup```{r}#| label: "dplyr-basics-2"library(here) # manage file pathslibrary(socviz) # data and some useful functionslibrary(tidyverse) # your friend and minelibrary(readxl)options(dplyr.summarise.inform =FALSE)library(conflicted)conflicts_prefer( dplyr::filter, dplyr::lag)```# `AK1.xlsx````{r}df_ak <-read_xlsx(here("projects", "04-problem-set", "data", "AK1.xlsx"), skip =2) |> janitor::clean_names() |>rename(age = x1)df_ak``````{r}headers <-read_xlsx(here("projects", "04-problem-set", "data", "AK1.xlsx"), skip =1, n_max =1) |>colnames() |> janitor::make_clean_names()df_ak <-read_xlsx(here("projects", "04-problem-set", "data", "AK1.xlsx"), skip =2) |> janitor::clean_names() |>rename(age = x1) colnames(df_ak) <- headersdf_ak```# `countries.csv````{r}df_cc <-read_delim(here("projects", "04-problem-set", "data", "countries.csv")) |>pivot_longer(cols = VarA2017:VarB2021, names_to =c("var", "year"), names_pattern ="Var([A|B])(\\d{4})") |> janitor::clean_names()df_cc |>print(n =Inf)```# `street_tree_planting.csv````{r}plant_date_fmt <-"%m/%d/%Y"df_tree <-read_csv(here("projects", "04-problem-set", "data", "street_tree_planting.csv"),skip =7,col_types =cols(lng =col_double(),lat =col_double(),Borough =col_character(),ZipCode =col_character(),BuildingNumber =col_character(),StreetName =col_character(),FiscalYear =col_integer(),PlantingSpaceID =col_character(),CommunityBoard =col_character(),PlantingSeason =col_date(format = plant_date_fmt),CityCouncil =col_character(),TreeID =col_character(),WOId =col_character(),WOStatus =col_character(),CompletedDate =col_datetime())) |> janitor::clean_names() df_tree```# `table004.xlsx`Trying to do it in a "row-agnostic" manner (i.e. I don't want to index any rows by their row number).```{r}df_tab4 <-read_xlsx(here("projects", "04-problem-set", "data", "table004.xlsx"), skip =3, col_names =FALSE) |># ugh, have to search for literal double backslashesmutate(across(everything(), \(x) str_remove_all(x, "(\\\\)(\\d{1},?)"))) |># Tidy agegroupsmutate(agegroup =!str_detect(...1, "At birth"),agegroup =ifelse(agegroup, NA, "At birth"),agegroup =ifelse(!str_detect(...1, "At 65 years"), agegroup, "At 65 years"),agegroup =ifelse(!str_detect(...1, "At 75 years"), agegroup, "At 75 years"),.before =everything()) |>fill(agegroup) |># Cleanupfilter(...2!="Life expectancy (years)") |>mutate(...1 =na_if(...1, "Specified age and year"),agegroup =ifelse(!str_detect(...1, "At 65 years"), agegroup, "At 65 years"),agegroup =ifelse(!str_detect(...1, "At 75 years"), agegroup, "At 75 years"),agegroup =ifelse(is.na(...1), "At birth", agegroup),.before =everything()) |>fill(...1, .direction ="up") |># Get the race/ethnicity group names across the columns by repeatedly# pivoting and filling. A bit ugly.pivot_longer(...2:...4) |>fill(value) |>pivot_wider(names_from = name, values_from = value) |>pivot_longer(...5:...7) |>fill(value) |>pivot_wider(names_from = name, values_from = value) |>pivot_longer(...8:...10) |>fill(value) |>pivot_wider(names_from = name, values_from = value) |># NOW replace the "- - -" NAs, but not before mutate(## NAify dashes and ellipsis across(everything(), \(x) na_if(x, "- - -")),across(everything(), \(x) na_if(x, "…")),## Merge the rownames into racevar:sexvar by grabbing from the previous## row if and only if the cell doesn't begin with a digit AND its lag(1)## doesn't begin with a digitacross(...1:...10, \(x) if_else(!str_detect(x, "^\\d") &!str_detect(lag(x), "^\\d"), paste0(lag(x),":", x), x)), across(everything(), \(x) str_replace(x, "–", ""))) |># Drop the first rowfilter(!if_all(...2:...10, \(x) is.na(x))) |># Drop the superfluous middle rowfilter(str_detect(...2, "^\\d") |str_detect(...2, ":")) |># Create an indicator for the subtablesmutate(subtable =!str_detect(...2, "\\d"), .before =everything()) |>mutate(subtable_n =seq(n()), .by = subtable, subtable_n =if_else(!subtable, NA, subtable_n), .before =everything()) |>fill(subtable_n) |>select(-subtable) |># The subtables will have different colnames so we nest them, add those# colnames (extracting them from the first rows of each subtable), and # then drop the now-useless first row and pivot_longer to harmonize the# columns so we can unnest without loss or confusionnest(.by = subtable_n) |>mutate(vnames =map(data, \(x) c("agegroup", "year", x[1,3:ncol(x)])),data =map2(data, vnames, \(x,y) set_names(x,y)), # Drop the first row; superfluous nowdata =map(data, \(x) slice_tail(x, n =-1)),data =map(data, \(x) pivot_longer(x, cols =!all_of(c("agegroup", "year"))))) |>select(-vnames) |>unnest(data) |>select(-subtable_n) |># Finally, split the name column and clean upseparate(name, into =c("race", "sex"), sep =":") |>mutate(value =str_squish(value), value =na_if(value, "NA"), value =as.numeric(value))## Check for coercion# which(is.na(as.numeric(df_tab4_tidy$value)) != is.na(df_tab4_tidy$value))df_tab4 |>slice_sample(n=20) |>print(n=20)```# `table004.xlsx`, an elegant solution from Andrés This approach splits the difference between single pipeline and being "row-agnostic". Note the use of `readr::parse_number()````{r}age_categories <-c("At birth", "At 65 years", "At 75 years")race_categories <-c("All races","White","Black or African American","White, not Hispanic","Black, not Hispanic","Hispanic")sex_categories <-c("Both sexes", "Male", "Female")race_rgx <-paste(race_categories, collapse ="|")age_rgx <-paste(age_categories, collapse ="|")# Raw Data ----------------------------------------------------------------raw <- readxl::read_xlsx(here("projects", "04-problem-set", "data", "table004.xlsx"), col_names =FALSE)row_end <-detect_index(raw[[1]], \(x) grepl("- - -", x))raw <- raw[1:(row_end -1), ]# Life + Race + Sex -------------------------------------------------------index <-split(2:ncol(raw), rep(1:3, each =3))df_life_exp <-map(index, function(i) { temp <- raw[, i] |>rowid_to_column() |>set_names(c("rowid", sex_categories)) temp$race <-str_extract(temp[[2]], race_rgx) temp <-fill(temp, race) temp |>mutate(across(all_of(sex_categories), readr::parse_number)) |>drop_na() |>pivot_longer(all_of(sex_categories), names_to ="sex", values_to ="life_exp")}) |>bind_rows()# Year + Age --------------------------------------------------------------df_yr <- raw[1] |>set_names("year") |>mutate(age =str_extract(year, age_rgx)) |>fill(age) |>rowid_to_column() |>mutate(year = readr::parse_number(year)) |>mutate(year =ifelse(str_detect(year, "^\\d$"), NA, year))df_yr$single_race <-as.integer(str_detect(raw[[1]], "single race"))df_complete <- df_yr |>drop_na() |>right_join(df_life_exp)df_complete```