Example 02: The basics of dplyr

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/socdata.co
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Crosstabs

Code
## library(socviz) # if not loaded
# Print the gss_sm tibble to inspect its dimensions and column types.
gss_sm
# A tibble: 2,867 × 32
    year    id ballot       age childs sibs   degree race  sex   region income16
   <dbl> <dbl> <labelled> <dbl>  <dbl> <labe> <fct>  <fct> <fct> <fct>  <fct>   
 1  2016     1 1             47      3 2      Bache… White Male  New E… $170000…
 2  2016     2 2             61      0 3      High … White Male  New E… $50000 …
 3  2016     3 3             72      2 3      Bache… White Male  New E… $75000 …
 4  2016     4 1             43      4 3      High … White Fema… New E… $170000…
 5  2016     5 3             55      2 2      Gradu… White Fema… New E… $170000…
 6  2016     6 2             53      2 2      Junio… White Fema… New E… $60000 …
 7  2016     7 1             50      2 2      High … White Male  New E… $170000…
 8  2016     8 3             23      3 6      High … Other Fema… Middl… $30000 …
 9  2016     9 1             45      3 5      High … Black Male  Middl… $60000 …
10  2016    10 3             71      4 1      Junio… White Male  Middl… $60000 …
# ℹ 2,857 more rows
# ℹ 21 more variables: relig <fct>, marital <fct>, padeg <fct>, madeg <fct>,
#   partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
#   zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
#   agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
#   bigregion <fct>, partners_rc <fct>, obama <dbl>
Code
# Keep only the id, bigregion, and religion columns; every row is retained.
gss_sm |>
  select(id, bigregion, religion)
# A tibble: 2,867 × 3
      id bigregion religion  
   <dbl> <fct>     <fct>     
 1     1 Northeast None      
 2     2 Northeast None      
 3     3 Northeast Catholic  
 4     4 Northeast Catholic  
 5     5 Northeast None      
 6     6 Northeast None      
 7     7 Northeast None      
 8     8 Northeast Catholic  
 9     9 Northeast Protestant
10    10 Northeast None      
# ℹ 2,857 more rows
Code
# Grouping does not change the data itself; it records bigregion as the
# grouping variable that subsequent verbs will work within.
gss_sm |>
  group_by(bigregion)
# A tibble: 2,867 × 32
# Groups:   bigregion [4]
    year    id ballot       age childs sibs   degree race  sex   region income16
   <dbl> <dbl> <labelled> <dbl>  <dbl> <labe> <fct>  <fct> <fct> <fct>  <fct>   
 1  2016     1 1             47      3 2      Bache… White Male  New E… $170000…
 2  2016     2 2             61      0 3      High … White Male  New E… $50000 …
 3  2016     3 3             72      2 3      Bache… White Male  New E… $75000 …
 4  2016     4 1             43      4 3      High … White Fema… New E… $170000…
 5  2016     5 3             55      2 2      Gradu… White Fema… New E… $170000…
 6  2016     6 2             53      2 2      Junio… White Fema… New E… $60000 …
 7  2016     7 1             50      2 2      High … White Male  New E… $170000…
 8  2016     8 3             23      3 6      High … Other Fema… Middl… $30000 …
 9  2016     9 1             45      3 5      High … Black Male  Middl… $60000 …
10  2016    10 3             71      4 1      Junio… White Male  Middl… $60000 …
# ℹ 2,857 more rows
# ℹ 21 more variables: relig <fct>, marital <fct>, padeg <fct>, madeg <fct>,
#   partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
#   zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
#   agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
#   bigregion <fct>, partners_rc <fct>, obama <dbl>
Code
# Count respondents in each bigregion-by-religion cell. summarize() peels off
# the last grouping level (religion), so the result is still grouped by
# bigregion; sum(total) in mutate() is therefore the regional total, and
# freq / pct are shares within each region.
gss_sm |>
  group_by(bigregion, religion) |> 
  summarize(total = n()) |>
  mutate(freq = total / sum(total),
           pct = round((freq*100), 1))
`summarise()` has grouped output by 'bigregion'. You can override using the
`.groups` argument.
# A tibble: 24 × 5
# Groups:   bigregion [4]
   bigregion religion   total    freq   pct
   <fct>     <fct>      <int>   <dbl> <dbl>
 1 Northeast Protestant   158 0.324    32.4
 2 Northeast Catholic     162 0.332    33.2
 3 Northeast Jewish        27 0.0553    5.5
 4 Northeast None         112 0.230    23  
 5 Northeast Other         28 0.0574    5.7
 6 Northeast <NA>           1 0.00205   0.2
 7 Midwest   Protestant   325 0.468    46.8
 8 Midwest   Catholic     172 0.247    24.7
 9 Midwest   Jewish         3 0.00432   0.4
10 Midwest   None         157 0.226    22.6
# ℹ 14 more rows

dplyr will keep informing us about what summarize() is grouping by; we can tell it to shut up:

Code
## Silence. With an 's'!
# Session-wide option: switches off the "has grouped output by" message
# that summarise() prints.
options(dplyr.summarise.inform = FALSE)
Code
# Within-region counts, proportions, and percentages by religion; with the
# inform option turned off, no grouping message accompanies the table.
gss_sm |>
  group_by(bigregion, religion) |>
  summarize(total = n()) |>
  mutate(freq = total / sum(total),
           pct = round((freq*100), 1)) 
# A tibble: 24 × 5
# Groups:   bigregion [4]
   bigregion religion   total    freq   pct
   <fct>     <fct>      <int>   <dbl> <dbl>
 1 Northeast Protestant   158 0.324    32.4
 2 Northeast Catholic     162 0.332    33.2
 3 Northeast Jewish        27 0.0553    5.5
 4 Northeast None         112 0.230    23  
 5 Northeast Other         28 0.0574    5.7
 6 Northeast <NA>           1 0.00205   0.2
 7 Midwest   Protestant   325 0.468    46.8
 8 Midwest   Catholic     172 0.247    24.7
 9 Midwest   Jewish         3 0.00432   0.4
10 Midwest   None         157 0.226    22.6
# ℹ 14 more rows

Ways to summarize

Code
# Long form: group explicitly, then count the rows in each group with n().
# The result stays grouped by bigregion.
gss_sm |>
  group_by(bigregion, religion) |> 
  summarize(n = n()) 
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows
Code
# tally() is shorthand for summarize(n = n()); the result is likewise still
# grouped by bigregion.
gss_sm |>
  group_by(bigregion, religion) |>
  tally() 
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows
Code
# count() groups, tallies, and ungroups in a single step: same counts, but
# the returned table carries no grouping.
gss_sm |>
  count(bigregion, religion) 
# A tibble: 24 × 3
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows

Feed results forward

Code
# Reshape the long counts into a religion-by-region table (one column per
# bigregion value) and render it with knitr::kable().
gss_sm |>
  count(bigregion, religion) |>
  pivot_wider(names_from = bigregion, values_from = n) |>
  knitr::kable()
religion Northeast Midwest South West
Protestant 158 325 650 238
Catholic 162 172 160 155
Jewish 27 3 11 10
None 112 157 170 180
Other 28 33 50 48
NA 1 5 11 1
Code
# Percentage of each region's respondents in each religion category, drawn
# as horizontal bars with one panel per region.
gss_sm |>
  group_by(bigregion, religion) |>
  # tally() leaves the table grouped by bigregion, so sum(n) below is the
  # regional total and pct is a within-region percentage.
  tally() |>
  # The digits argument belongs inside round(); outside it, the percentages
  # are rounded to whole numbers and mutate() gains a stray constant column.
  mutate(pct = round((n / sum(n)) * 100, 1)) |>
  # Drop rows with missing values (e.g. the NA religion category).
  drop_na() |>
  ggplot(mapping = aes(x = pct, y = reorder(religion, -pct), fill = religion)) +
  geom_col() +
  labs(x = "Percent", y = NULL) +
  guides(fill = "none") +
  facet_wrap(~ bigregion, nrow = 1)

Left and right assignment

Code
# Left assignment: the name comes first and receives the pipeline's result.
rel_by_region <- gss_sm |> 
  count(bigregion, religion) |>
  mutate(pct = round((n/sum(n))*100, 1))

# Print the stored table.
rel_by_region
# A tibble: 24 × 4
   bigregion religion       n   pct
   <fct>     <fct>      <int> <dbl>
 1 Northeast Protestant   158   5.5
 2 Northeast Catholic     162   5.7
 3 Northeast Jewish        27   0.9
 4 Northeast None         112   3.9
 5 Northeast Other         28   1  
 6 Northeast <NA>           1   0  
 7 Midwest   Protestant   325  11.3
 8 Midwest   Catholic     172   6  
 9 Midwest   Jewish         3   0.1
10 Midwest   None         157   5.5
# ℹ 14 more rows
Code
# Right assignment: the same pipeline, with `->` storing the result in the
# name that follows it.
gss_sm |>
  count(bigregion, religion) |>
  mutate(pct = round((n/sum(n))*100, 1)) -> 
rel_by_region 

# Print the stored table.
rel_by_region
# A tibble: 24 × 4
   bigregion religion       n   pct
   <fct>     <fct>      <int> <dbl>
 1 Northeast Protestant   158   5.5
 2 Northeast Catholic     162   5.7
 3 Northeast Jewish        27   0.9
 4 Northeast None         112   3.9
 5 Northeast Other         28   1  
 6 Northeast <NA>           1   0  
 7 Midwest   Protestant   325  11.3
 8 Midwest   Catholic     172   6  
 9 Midwest   Jewish         3   0.1
10 Midwest   None         157   5.5
# ℹ 14 more rows
Code
# Store the bigregion-by-religion counts using left assignment ...
gss_tab <- gss_sm |>
  count(bigregion, religion)
Code
# ... and the same thing written with right assignment.
gss_sm |>
  count(bigregion, religion) -> gss_tab

Check your tables

Code
# count() returns an ungrouped table, so sum(n) here is the total over all
# rows: pct is each cell's share of the whole table, not of its region.
rel_by_region <- gss_sm |>
  count(bigregion, religion) |>
  mutate(pct = round((n/sum(n))*100, 1))

rel_by_region
# A tibble: 24 × 4
   bigregion religion       n   pct
   <fct>     <fct>      <int> <dbl>
 1 Northeast Protestant   158   5.5
 2 Northeast Catholic     162   5.7
 3 Northeast Jewish        27   0.9
 4 Northeast None         112   3.9
 5 Northeast Other         28   1  
 6 Northeast <NA>           1   0  
 7 Midwest   Protestant   325  11.3
 8 Midwest   Catholic     172   6  
 9 Midwest   Jewish         3   0.1
10 Midwest   None         157   5.5
# ℹ 14 more rows
Code
## Each region should sum to ~100
# Sanity check: add up pct within each region. Because pct was computed on
# the ungrouped count() table, the regional totals fall well short of 100.
rel_by_region |>
  group_by(bigregion) |>
  summarize(total = sum(pct))
# A tibble: 4 × 2
  bigregion total
  <fct>     <dbl>
1 Northeast  17  
2 Midwest    24.3
3 South      36.7
4 West       22  
Code
# Ungrouped calculation: pct is a share of the whole table.
rel_by_region <- gss_sm |>
  count(bigregion, religion) |> 
  mutate(pct = round((n/sum(n))*100, 1))
Code
# Summing pct over every row (no grouping) gives the overall total.
rel_by_region |>
  summarize(total = sum(pct))
# A tibble: 1 × 1
  total
  <dbl>
1   100
Code
# Ungrouped calculation again: count() drops the grouping, so sum(n) is the
# total number of rows counted.
rel_by_region <- gss_sm |>
  count(bigregion, religion) |> 
  mutate(pct = round((n/sum(n))*100, 1))
Code
# The percentages add up across the full table rather than within regions.
rel_by_region |>
  summarize(total = sum(pct))
# A tibble: 1 × 1
  total
  <dbl>
1   100
Code
# group_by() + tally() keeps the bigregion grouping, so sum(n) is the
# regional total and pct is a within-region percentage.
rel_by_region <- gss_sm |>
  group_by(bigregion, religion) |> 
  tally() |> 
  mutate(pct = round((n/sum(n))*100, 1))
Code
# Check
# Regional totals should come out at approximately 100, up to rounding.
rel_by_region |>
  group_by(bigregion) |>
  summarize(total = sum(pct))
# A tibble: 4 × 2
  bigregion total
  <fct>     <dbl>
1 Northeast 100  
2 Midwest    99.9
3 South     100  
4 West      100. 
Code
# Cell size, mean age, and mean number of children for every
# race-by-sex-by-degree combination, plotted as bars of mean_kids by degree
# in a sex-by-race grid of panels.
gss_sm |>
  group_by(race, sex, degree) |>
  summarize(n = n(),
            mean_age = mean(age, na.rm = TRUE),
            mean_kids = mean(childs, na.rm = TRUE)) |>
  # summarize() drops only the last grouping level, so pct is each degree's
  # share within its race-sex group.
  mutate(pct = n/sum(n)*100) |>
  filter(race !="Other") |>
  # Be careful with drop_na()
  # (called with no arguments it drops a row if ANY column is missing)
  drop_na() |>
  ggplot(mapping = aes(x = mean_kids, y = degree)) + # Some ggplot ...
  geom_col() + facet_grid(sex ~ race) +
  labs(x = "Average number of Children", y = NULL)

Filtering and selection

Code
# library(socviz)
# Print the organdata tibble to inspect its dimensions and column types.
organdata
# A tibble: 238 × 21
   country   year       donors   pop pop_dens   gdp gdp_lag health health_lag
   <chr>     <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
 1 Australia NA          NA    17065    0.220 16774   16591   1300       1224
 2 Australia 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
 3 Australia 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
 4 Australia 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
 5 Australia 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
 6 Australia 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
 7 Australia 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
 8 Australia 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
 9 Australia 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
10 Australia 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
# ℹ 228 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Code
# Keep rows where both conditions hold; & is the element-wise AND.
organdata |>
  filter(consent_law == "Informed" & donors > 15)
# A tibble: 30 × 21
   country year       donors   pop pop_dens   gdp gdp_lag health health_lag
   <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
 1 Canada  2000-01-01   15.3 30770    0.309 28472   26658   2541       2400
 2 Denmark 1992-01-01   16.1  5171   12.0   19644   19126   1660       1603
 3 Ireland 1991-01-01   19    3534    5.03  13495   12917    884        791
 4 Ireland 1992-01-01   19.5  3558    5.06  14241   13495   1005        884
 5 Ireland 1993-01-01   17.1  3576    5.09  14927   14241   1041       1005
 6 Ireland 1994-01-01   20.3  3590    5.11  15990   14927   1119       1041
 7 Ireland 1995-01-01   24.6  3609    5.14  17789   15990   1208       1119
 8 Ireland 1996-01-01   16.8  3636    5.17  19245   17789   1269       1208
 9 Ireland 1997-01-01   20.9  3673    5.23  22017   19245   1417       1269
10 Ireland 1998-01-01   23.8  3715    5.29  23995   22017   1487       1417
# ℹ 20 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Code
# where(is.integer) adds every integer-typed column to the two named ones.
organdata |>
  select(country, year, where(is.integer)) 
# A tibble: 238 × 8
   country   year         pop   gdp gdp_lag cerebvas assault external
   <chr>     <date>     <int> <int>   <int>    <int>   <int>    <int>
 1 Australia NA         17065 16774   16591      682      21      444
 2 Australia 1991-01-01 17284 17171   16774      647      19      425
 3 Australia 1992-01-01 17495 17914   17171      630      17      406
 4 Australia 1993-01-01 17667 18883   17914      611      18      376
 5 Australia 1994-01-01 17855 19849   18883      631      17      387
 6 Australia 1995-01-01 18072 21079   19849      592      16      371
 7 Australia 1996-01-01 18311 21923   21079      576      17      395
 8 Australia 1997-01-01 18518 22961   21923      525      17      385
 9 Australia 1998-01-01 18711 24148   22961      516      16      410
10 Australia 1999-01-01 18926 25445   24148      493      15      409
# ℹ 228 more rows
Code
# Same idea for character columns. country is character too, but it has
# already been selected by name, so it appears only once in the result.
organdata |>
  select(country, year, where(is.character))
# A tibble: 238 × 8
   country  year       world opt   consent_law consent_practice consistent ccode
   <chr>    <date>     <chr> <chr> <chr>       <chr>            <chr>      <chr>
 1 Austral… NA         Libe… In    Informed    Informed         Yes        Oz   
 2 Austral… 1991-01-01 Libe… In    Informed    Informed         Yes        Oz   
 3 Austral… 1992-01-01 Libe… In    Informed    Informed         Yes        Oz   
 4 Austral… 1993-01-01 Libe… In    Informed    Informed         Yes        Oz   
 5 Austral… 1994-01-01 Libe… In    Informed    Informed         Yes        Oz   
 6 Austral… 1995-01-01 Libe… In    Informed    Informed         Yes        Oz   
 7 Austral… 1996-01-01 Libe… In    Informed    Informed         Yes        Oz   
 8 Austral… 1997-01-01 Libe… In    Informed    Informed         Yes        Oz   
 9 Austral… 1998-01-01 Libe… In    Informed    Informed         Yes        Oz   
10 Austral… 1999-01-01 Libe… In    Informed    Informed         Yes        Oz   
# ℹ 228 more rows
Code
# starts_with("gdp") selects columns by name prefix: here gdp and gdp_lag.
organdata |>
  select(country, year, starts_with("gdp")) 
# A tibble: 238 × 4
   country   year         gdp gdp_lag
   <chr>     <date>     <int>   <int>
 1 Australia NA         16774   16591
 2 Australia 1991-01-01 17171   16774
 3 Australia 1992-01-01 17914   17171
 4 Australia 1993-01-01 18883   17914
 5 Australia 1994-01-01 19849   18883
 6 Australia 1995-01-01 21079   19849
 7 Australia 1996-01-01 21923   21079
 8 Australia 1997-01-01 22961   21923
 9 Australia 1998-01-01 24148   22961
10 Australia 1999-01-01 25445   24148
# ℹ 228 more rows
Code
# Keep rows for either country; | is the element-wise OR.
organdata |>
  filter(country == "Australia" | country == "Canada")
# A tibble: 28 × 21
   country   year       donors   pop pop_dens   gdp gdp_lag health health_lag
   <chr>     <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
 1 Australia NA          NA    17065    0.220 16774   16591   1300       1224
 2 Australia 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
 3 Australia 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
 4 Australia 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
 5 Australia 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
 6 Australia 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
 7 Australia 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
 8 Australia 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
 9 Australia 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
10 Australia 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
# ℹ 18 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Code
# With more than a couple of values, put them in a vector and test
# membership with %in% instead of chaining == comparisons with |.
my_countries <- c("Australia", "Canada", "United States", "Ireland")

organdata |>
  filter(country %in% my_countries) 
# A tibble: 56 × 21
   country   year       donors   pop pop_dens   gdp gdp_lag health health_lag
   <chr>     <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
 1 Australia NA          NA    17065    0.220 16774   16591   1300       1224
 2 Australia 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
 3 Australia 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
 4 Australia 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
 5 Australia 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
 6 Australia 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
 7 Australia 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
 8 Australia 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
 9 Australia 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
10 Australia 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
# ℹ 46 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Code
my_countries <- c("Australia", "Canada", "United States", "Ireland")

# Negate the membership test to keep every country that is NOT in the vector.
organdata |>
  filter(!(country %in% my_countries)) 
# A tibble: 182 × 21
   country year       donors   pop pop_dens   gdp gdp_lag health health_lag
   <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
 1 Austria NA           NA    7678     9.16 18914   17425   1344       1255
 2 Austria 1991-01-01   27.6  7755     9.25 19860   18914   1419       1344
 3 Austria 1992-01-01   23.1  7841     9.35 20601   19860   1551       1419
 4 Austria 1993-01-01   26.2  7906     9.43 21119   20601   1674       1551
 5 Austria 1994-01-01   21.4  7936     9.46 21940   21119   1739       1674
 6 Austria 1995-01-01   21.5  7948     9.48 22817   21940   1865       1739
 7 Austria 1996-01-01   24.7  7959     9.49 23798   22817   1986       1865
 8 Austria 1997-01-01   19.5  7968     9.50 24364   23798   1848       1986
 9 Austria 1998-01-01   20.7  7977     9.51 25423   24364   1953       1848
10 Austria 1999-01-01   25.9  7992     9.53 26513   25423   2069       1953
# ℹ 172 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Code
# Define a "not in" operator as the logical negation of %in%.
`%nin%` <- Negate(`%in%`) # this operator is included in the socviz package
Code
# Reads more naturally than, and is equivalent to,
# filter(!(country %in% my_countries)).
organdata |>
  filter(country %nin% my_countries) 
# A tibble: 182 × 21
   country year       donors   pop pop_dens   gdp gdp_lag health health_lag
   <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
 1 Austria NA           NA    7678     9.16 18914   17425   1344       1255
 2 Austria 1991-01-01   27.6  7755     9.25 19860   18914   1419       1344
 3 Austria 1992-01-01   23.1  7841     9.35 20601   19860   1551       1419
 4 Austria 1993-01-01   26.2  7906     9.43 21119   20601   1674       1551
 5 Austria 1994-01-01   21.4  7936     9.46 21940   21119   1739       1674
 6 Austria 1995-01-01   21.5  7948     9.48 22817   21940   1865       1739
 7 Austria 1996-01-01   24.7  7959     9.49 23798   22817   1986       1865
 8 Austria 1997-01-01   19.5  7968     9.50 24364   23798   1848       1986
 9 Austria 1998-01-01   20.7  7977     9.51 25423   24364   1953       1848
10 Austria 1999-01-01   25.9  7992     9.53 26513   25423   2069       1953
# ℹ 172 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Code
# Several summaries at once for each race-by-sex-by-degree cell: the cell
# size plus the mean age and mean number of children, ignoring missing values.
gss_sm |>
  group_by(race, sex, degree) |>
  summarize(n = n(),
            mean_age = mean(age, na.rm = TRUE),
            mean_kids = mean(childs, na.rm = TRUE))
# A tibble: 34 × 6
# Groups:   race, sex [6]
   race  sex    degree             n mean_age mean_kids
   <fct> <fct>  <fct>          <int>    <dbl>     <dbl>
 1 White Male   Lt High School    96     52.9      2.45
 2 White Male   High School      470     48.8      1.61
 3 White Male   Junior College    65     47.1      1.54
 4 White Male   Bachelor         208     48.6      1.35
 5 White Male   Graduate         112     56.0      1.71
 6 White Female Lt High School   101     55.4      2.81
 7 White Female High School      587     51.9      1.98
 8 White Female Junior College   101     48.2      1.91
 9 White Female Bachelor         218     49.2      1.44
10 White Female Graduate         138     53.6      1.38
# ℹ 24 more rows

Using across()

This is starting to get repetitive, which is a warning sign:

Code
# One hand-written line per summary statistic. It works, but every extra
# variable or function means another near-identical line.
organdata |>
  group_by(consent_law, country) |>
  summarize(donors_mean = mean(donors, na.rm = TRUE),
            donors_sd = sd(donors, na.rm = TRUE),
            gdp_mean = mean(gdp, na.rm = TRUE),
            health_mean = mean(health, na.rm = TRUE),
            roads_mean = mean(roads, na.rm = TRUE))
# A tibble: 17 × 7
# Groups:   consent_law [2]
   consent_law country     donors_mean donors_sd gdp_mean health_mean roads_mean
   <chr>       <chr>             <dbl>     <dbl>    <dbl>       <dbl>      <dbl>
 1 Informed    Australia          10.6     1.14    22179.       1958.      105. 
 2 Informed    Canada             14.0     0.751   23711.       2272.      109. 
 3 Informed    Denmark            13.1     1.47    23722.       2054.      102. 
 4 Informed    Germany            13.0     0.611   22163.       2349.      113. 
 5 Informed    Ireland            19.8     2.48    20824.       1480.      118. 
 6 Informed    Netherlands        13.7     1.55    23013.       1993.       76.1
 7 Informed    United Kin…        13.5     0.775   21359.       1561.       67.9
 8 Informed    United Sta…        20.0     1.33    29212.       3988.      155. 
 9 Presumed    Austria            23.5     2.42    23876.       1875.      150. 
10 Presumed    Belgium            21.9     1.94    22500.       1958.      155. 
11 Presumed    Finland            18.4     1.53    21019.       1615.       93.6
12 Presumed    France             16.8     1.60    22603.       2160.      156. 
13 Presumed    Italy              11.1     4.28    21554.       1757       122. 
14 Presumed    Norway             15.4     1.11    26448.       2217.       70.0
15 Presumed    Spain              28.1     4.96    16933        1289.      161. 
16 Presumed    Sweden             13.1     1.75    22415.       1951.       72.3
17 Presumed    Switzerland        14.2     1.71    27233        2776.       96.4

Better:

Code
my_vars <- c("gdp", "donors", "roads")

## nested parens again, but it's worth it
organdata |>
  group_by(consent_law, country) |>
  ## Tidyselect requires all_of() to
  ## make the selection explicit
  ## na.rm is supplied inside an anonymous function: forwarding extra
  ## arguments through across()'s `...` is deprecated as of dplyr 1.1.0.
  summarize(across(all_of(my_vars),
                   list(avg = \(x) mean(x, na.rm = TRUE))))
# A tibble: 17 × 5
# Groups:   consent_law [2]
   consent_law country        gdp_avg donors_avg roads_avg
   <chr>       <chr>            <dbl>      <dbl>     <dbl>
 1 Informed    Australia       22179.       10.6     105. 
 2 Informed    Canada          23711.       14.0     109. 
 3 Informed    Denmark         23722.       13.1     102. 
 4 Informed    Germany         22163.       13.0     113. 
 5 Informed    Ireland         20824.       19.8     118. 
 6 Informed    Netherlands     23013.       13.7      76.1
 7 Informed    United Kingdom  21359.       13.5      67.9
 8 Informed    United States   29212.       20.0     155. 
 9 Presumed    Austria         23876.       23.5     150. 
10 Presumed    Belgium         22500.       21.9     155. 
11 Presumed    Finland         21019.       18.4      93.6
12 Presumed    France          22603.       16.8     156. 
13 Presumed    Italy           21554.       11.1     122. 
14 Presumed    Norway          26448.       15.4      70.0
15 Presumed    Spain           16933        28.1     161. 
16 Presumed    Sweden          22415.       13.1      72.3
17 Presumed    Switzerland     27233        14.2      96.4
Code
my_vars <- c("gdp", "donors", "roads")

## Apply three summary functions to each variable named in my_vars; output
## columns are named <variable>_<label>.
organdata |>
  group_by(consent_law, country) |>
  ## all_of() makes the use of an external character vector explicit
  ## (a bare vector in a selection is deprecated in tidyselect), and na.rm
  ## goes inside each anonymous function rather than through across()'s `...`.
  summarize(across(all_of(my_vars),
                   ## NOTE(review): the "sd" label is attached to var(), so
                   ## the *_sd columns hold variances, not standard
                   ## deviations. Left as-is to match the printed output.
                   list(avg = \(x) mean(x, na.rm = TRUE),
                        sd = \(x) var(x, na.rm = TRUE),
                        md = \(x) median(x, na.rm = TRUE))))
# A tibble: 17 × 11
# Groups:   consent_law [2]
   consent_law country      gdp_avg gdp_sd gdp_md donors_avg donors_sd donors_md
   <chr>       <chr>          <dbl>  <dbl>  <int>      <dbl>     <dbl>     <dbl>
 1 Informed    Australia     22179. 1.57e7  21923       10.6     1.31       10.4
 2 Informed    Canada        23711. 1.57e7  22764       14.0     0.564      14.0
 3 Informed    Denmark       23722. 1.52e7  23548       13.1     2.16       12.9
 4 Informed    Germany       22163. 6.26e6  22164       13.0     0.374      13  
 5 Informed    Ireland       20824. 4.45e7  19245       19.8     6.14       19.2
 6 Informed    Netherlands   23013. 1.42e7  22541       13.7     2.41       13.8
 7 Informed    United King…  21359. 1.54e7  20839       13.5     0.601      13.5
 8 Informed    United Stat…  29212. 2.09e7  28772       20.0     1.76       20.1
 9 Presumed    Austria       23876. 1.12e7  23798       23.5     5.84       23.8
10 Presumed    Belgium       22500. 1.01e7  22152       21.9     3.75       21.4
11 Presumed    Finland       21019. 1.35e7  19842       18.4     2.33       19.4
12 Presumed    France        22603. 1.06e7  21990       16.8     2.55       16.6
13 Presumed    Italy         21554. 7.74e6  21396       11.1    18.3        11.3
14 Presumed    Norway        26448. 4.21e7  26218       15.4     1.23       15.4
15 Presumed    Spain         16933  8.34e6  16416       28.1    24.6        28  
16 Presumed    Sweden        22415. 1.03e7  22029       13.1     3.07       12.7
17 Presumed    Switzerland   27233  4.64e6  26304       14.2     2.92       14.4
# ℹ 3 more variables: roads_avg <dbl>, roads_sd <dbl>, roads_md <dbl>
Code
my_vars <- c("gdp", "donors", "roads")

## Same summary, with labels that match the functions they name; output
## columns are named <variable>_<label>.
organdata |>
  group_by(consent_law, country) |>
  ## all_of() for the external vector; na.rm passed via anonymous functions
  ## instead of the deprecated `...` argument of across().
  summarize(across(all_of(my_vars),
                   list(mean = \(x) mean(x, na.rm = TRUE),
                        var = \(x) var(x, na.rm = TRUE),
                        median = \(x) median(x, na.rm = TRUE))))
# A tibble: 17 × 11
# Groups:   consent_law [2]
   consent_law country        gdp_mean gdp_var gdp_median donors_mean donors_var
   <chr>       <chr>             <dbl>   <dbl>      <int>       <dbl>      <dbl>
 1 Informed    Australia        22179.  1.57e7      21923        10.6      1.31 
 2 Informed    Canada           23711.  1.57e7      22764        14.0      0.564
 3 Informed    Denmark          23722.  1.52e7      23548        13.1      2.16 
 4 Informed    Germany          22163.  6.26e6      22164        13.0      0.374
 5 Informed    Ireland          20824.  4.45e7      19245        19.8      6.14 
 6 Informed    Netherlands      23013.  1.42e7      22541        13.7      2.41 
 7 Informed    United Kingdom   21359.  1.54e7      20839        13.5      0.601
 8 Informed    United States    29212.  2.09e7      28772        20.0      1.76 
 9 Presumed    Austria          23876.  1.12e7      23798        23.5      5.84 
10 Presumed    Belgium          22500.  1.01e7      22152        21.9      3.75 
11 Presumed    Finland          21019.  1.35e7      19842        18.4      2.33 
12 Presumed    France           22603.  1.06e7      21990        16.8      2.55 
13 Presumed    Italy            21554.  7.74e6      21396        11.1     18.3  
14 Presumed    Norway           26448.  4.21e7      26218        15.4      1.23 
15 Presumed    Spain            16933   8.34e6      16416        28.1     24.6  
16 Presumed    Sweden           22415.  1.03e7      22029        13.1      3.07 
17 Presumed    Switzerland      27233   4.64e6      26304        14.2      2.92 
# ℹ 4 more variables: donors_median <dbl>, roads_mean <dbl>, roads_var <dbl>,
#   roads_median <dbl>
Code
## Summarize every numeric column at once: where(is.numeric) picks the
## columns by type, so no list of names is needed.
organdata |>
  group_by(consent_law, country) |>
  ## na.rm is passed inside each anonymous function; forwarding it through
  ## across()'s `...` is deprecated as of dplyr 1.1.0.
  summarize(across(where(is.numeric),
                   list(mean = \(x) mean(x, na.rm = TRUE),
                        var = \(x) var(x, na.rm = TRUE),
                        median = \(x) median(x, na.rm = TRUE)))) |>
  print(n = 3) # just to save space here
# A tibble: 17 × 41
# Groups:   consent_law [2]
  consent_law country   donors_mean donors_var donors_median pop_mean  pop_var
  <chr>       <chr>           <dbl>      <dbl>         <dbl>    <dbl>    <dbl>
1 Informed    Australia        10.6      1.31           10.4   18318.  690385.
2 Informed    Canada           14.0      0.564          14.0   29608. 1422648.
3 Informed    Denmark          13.1      2.16           12.9    5257.    6497.
# ℹ 14 more rows
# ℹ 34 more variables: pop_median <int>, pop_dens_mean <dbl>,
#   pop_dens_var <dbl>, pop_dens_median <dbl>, gdp_mean <dbl>, gdp_var <dbl>,
#   gdp_median <int>, gdp_lag_mean <dbl>, gdp_lag_var <dbl>,
#   gdp_lag_median <dbl>, health_mean <dbl>, health_var <dbl>,
#   health_median <dbl>, health_lag_mean <dbl>, health_lag_var <dbl>,
#   health_lag_median <dbl>, pubhealth_mean <dbl>, pubhealth_var <dbl>, …

across() is flexible:

Code
## Same summary, but .names puts the function label first, giving columns
## such as mean_donors instead of donors_mean.
organdata |>
  group_by(consent_law, country) |>
  ## na.rm is passed inside each anonymous function; forwarding it through
  ## across()'s `...` is deprecated as of dplyr 1.1.0.
  summarize(across(where(is.numeric),
                   list(mean = \(x) mean(x, na.rm = TRUE),
                        var = \(x) var(x, na.rm = TRUE),
                        median = \(x) median(x, na.rm = TRUE)),
                   ## {.fn} is the list label, {.col} the source column.
                   .names = "{.fn}_{.col}")) |>
  print(n = 3)
# A tibble: 17 × 41
# Groups:   consent_law [2]
  consent_law country   mean_donors var_donors median_donors mean_pop  var_pop
  <chr>       <chr>           <dbl>      <dbl>         <dbl>    <dbl>    <dbl>
1 Informed    Australia        10.6      1.31           10.4   18318.  690385.
2 Informed    Canada           14.0      0.564          14.0   29608. 1422648.
3 Informed    Denmark          13.1      2.16           12.9    5257.    6497.
# ℹ 14 more rows
# ℹ 34 more variables: median_pop <int>, mean_pop_dens <dbl>,
#   var_pop_dens <dbl>, median_pop_dens <dbl>, mean_gdp <dbl>, var_gdp <dbl>,
#   median_gdp <int>, mean_gdp_lag <dbl>, var_gdp_lag <dbl>,
#   median_gdp_lag <dbl>, mean_health <dbl>, var_health <dbl>,
#   median_health <dbl>, mean_health_lag <dbl>, var_health_lag <dbl>,
#   median_health_lag <dbl>, mean_pubhealth <dbl>, var_pubhealth <dbl>, …

Tidy selectors

Code
# Keep only the character columns, then upper-case every one of them.
organdata |>
  select(where(is.character)) |>
  mutate(across(everything(), toupper))
# A tibble: 238 × 7
   country   world   opt   consent_law consent_practice consistent ccode
   <chr>     <chr>   <chr> <chr>       <chr>            <chr>      <chr>
 1 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 2 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 3 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 4 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 5 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 6 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 7 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 8 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
 9 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
10 AUSTRALIA LIBERAL IN    INFORMED    INFORMED         YES        OZ   
# ℹ 228 more rows
Code
# Mean of `donors` per country, smallest first.
organdata |>
  group_by(consent_law, country) |>
  summarize(
    donors = mean(donors, na.rm = TRUE),
    .groups = "drop_last" # the grouping summarize() leaves here by default
  ) |>
  arrange(donors, .by_group = FALSE) |> # sort across groups, not within them
  print(n = 5)
# A tibble: 17 × 3
# Groups:   consent_law [2]
  consent_law country   donors
  <chr>       <chr>      <dbl>
1 Informed    Australia   10.6
2 Presumed    Italy       11.1
3 Informed    Germany     13.0
4 Informed    Denmark     13.1
5 Presumed    Sweden      13.1
# ℹ 12 more rows
Code
# Mean of `donors` per country, smallest first.
organdata |>
  group_by(consent_law, country) |>
  summarize(
    donors = mean(donors, na.rm = TRUE),
    .groups = "drop_last" # the grouping summarize() leaves here by default
  ) |>
  arrange(donors, .by_group = FALSE) |> # sort across groups, not within them
  print(n = 5)
# A tibble: 17 × 3
# Groups:   consent_law [2]
  consent_law country   donors
  <chr>       <chr>      <dbl>
1 Informed    Australia   10.6
2 Presumed    Italy       11.1
3 Informed    Germany     13.0
4 Informed    Denmark     13.1
5 Presumed    Sweden      13.1
# ℹ 12 more rows
Code
# Mean of `donors` per country, largest first.
organdata |>
  group_by(consent_law, country) |>
  summarize(
    donors = mean(donors, na.rm = TRUE),
    .groups = "drop_last" # the grouping summarize() leaves here by default
  ) |>
  arrange(desc(donors), .by_group = FALSE) |> # sort across groups
  print(n = 5)
# A tibble: 17 × 3
# Groups:   consent_law [2]
  consent_law country       donors
  <chr>       <chr>          <dbl>
1 Presumed    Spain           28.1
2 Presumed    Austria         23.5
3 Presumed    Belgium         21.9
4 Informed    United States   20.0
5 Informed    Ireland         19.8
# ℹ 12 more rows

slice_max et al

Code
# Top five countries by mean `donors` within each consent_law. After the
# summary the data is still grouped by consent_law, so slice_max() picks five
# rows per group.
organdata |>
  group_by(consent_law, country) |>
  summarize(
    donors = mean(donors, na.rm = TRUE),
    .groups = "drop_last"
  ) |>
  slice_max(order_by = donors, n = 5)
# A tibble: 10 × 3
# Groups:   consent_law [2]
   consent_law country        donors
   <chr>       <chr>           <dbl>
 1 Informed    United States    20.0
 2 Informed    Ireland          19.8
 3 Informed    Canada           14.0
 4 Informed    Netherlands      13.7
 5 Informed    United Kingdom   13.5
 6 Presumed    Spain            28.1
 7 Presumed    Austria          23.5
 8 Presumed    Belgium          21.9
 9 Presumed    Finland          18.4
10 Presumed    France           16.8

Window functions

Code
## Data on COVID-19
library(covdata)

Attaching package: 'covdata'
The following object is masked _by_ '.GlobalEnv':

    %nin%
The following object is masked from 'package:socviz':

    %nin%
The following object is masked from 'package:datasets':

    uspop
Code
# Print the weekly national COVID-19 table from covdata (a tibble, so only the
# first rows are shown).
covnat_weekly 
# A tibble: 4,966 × 11
   date       year_week cname   iso3      pop cases deaths cu_cases cu_deaths
   <date>     <chr>     <chr>   <chr>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>
 1 2019-12-30 2020-01   Austria AUT   8932664    NA     NA       NA        NA
 2 2020-01-06 2020-02   Austria AUT   8932664    NA     NA       NA        NA
 3 2020-01-13 2020-03   Austria AUT   8932664    NA     NA       NA        NA
 4 2020-01-20 2020-04   Austria AUT   8932664    NA     NA       NA        NA
 5 2020-01-27 2020-05   Austria AUT   8932664    NA     NA       NA        NA
 6 2020-02-03 2020-06   Austria AUT   8932664    NA     NA       NA        NA
 7 2020-02-10 2020-07   Austria AUT   8932664    NA     NA       NA        NA
 8 2020-02-17 2020-08   Austria AUT   8932664    NA     NA       NA        NA
 9 2020-02-24 2020-09   Austria AUT   8932664    12      0       12         0
10 2020-03-02 2020-10   Austria AUT   8932664   115      0      127         0
# ℹ 4,956 more rows
# ℹ 2 more variables: r14_cases <dbl>, r14_deaths <dbl>
Code
# Weekly and running-total case counts for France.
covnat_weekly |>
  filter(iso3 == "FRA") |>
  select(date, cname, iso3, cases) |>
  # cumsum() follows row order, so sort chronologically instead of relying on
  # the order the rows happen to be stored in.
  arrange(date) |>
  mutate(cases = coalesce(cases, 0), # convert NA vals in `cases` to 0
         cumulative = cumsum(cases))
# A tibble: 159 × 5
   date       cname  iso3  cases cumulative
   <date>     <chr>  <chr> <dbl>      <dbl>
 1 2019-12-30 France FRA       0          0
 2 2020-01-06 France FRA       0          0
 3 2020-01-13 France FRA       0          0
 4 2020-01-20 France FRA       3          3
 5 2020-01-27 France FRA       3          6
 6 2020-02-03 France FRA       6         12
 7 2020-02-10 France FRA       0         12
 8 2020-02-17 France FRA       4         16
 9 2020-02-24 France FRA     133        149
10 2020-03-02 France FRA     981       1130
# ℹ 149 more rows
Code
# Weeks in the top 10% of weekly deaths for France.
covnat_weekly |>
  filter(iso3 == "FRA") |>
  select(date, cname, iso3, deaths) |>
  # Deliberately a second filter() call: cume_dist() must rank the French rows
  # only, which requires the country filter to have been applied already.
  filter(cume_dist(desc(deaths)) < 0.1) # i.e. Top 10%
# A tibble: 15 × 4
   date       cname  iso3  deaths
   <date>     <chr>  <chr>  <dbl>
 1 2020-04-06 France FRA     3348
 2 2020-10-26 France FRA     3517
 3 2020-11-02 France FRA     5281
 4 2020-11-09 France FRA     6018
 5 2020-11-16 France FRA     6208
 6 2020-11-23 France FRA     5215
 7 2020-11-30 France FRA     4450
 8 2020-12-07 France FRA     4257
 9 2020-12-14 France FRA     3786
10 2020-12-21 France FRA     3560
11 2021-01-04 France FRA     3851
12 2021-01-11 France FRA     3833
13 2021-01-18 France FRA     3754
14 2021-01-25 France FRA     3535
15 2021-02-01 France FRA     3431
Code
# Cumulative death counts for NY, most recent date first.
covus |>
  filter(measure == "death", state %in% "NY") |>
  group_by(state) |>
  arrange(desc(date))
# A tibble: 371 × 7
# Groups:   state [1]
   date       state fips  data_quality_grade measure count measure_label
   <date>     <chr> <chr> <lgl>              <chr>   <dbl> <chr>        
 1 2021-03-07 NY    36    NA                 death   39029 Deaths       
 2 2021-03-06 NY    36    NA                 death   38970 Deaths       
 3 2021-03-05 NY    36    NA                 death   38891 Deaths       
 4 2021-03-04 NY    36    NA                 death   38796 Deaths       
 5 2021-03-03 NY    36    NA                 death   38735 Deaths       
 6 2021-03-02 NY    36    NA                 death   38660 Deaths       
 7 2021-03-01 NY    36    NA                 death   38577 Deaths       
 8 2021-02-28 NY    36    NA                 death   38497 Deaths       
 9 2021-02-27 NY    36    NA                 death   38407 Deaths       
10 2021-02-26 NY    36    NA                 death   38321 Deaths       
# ℹ 361 more rows

Lead and Lag

Code
# The integers 1 through 20, used to demonstrate lag() below.
my_vec <- seq_len(20L)
my_vec
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
Code
# dplyr's lag() (it masks stats::lag() once the tidyverse is attached).
dplyr::lag(my_vec, n = 1L) # first element has no lag
 [1] NA  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
Code
# Difference between each element and the one before it; NA for the first.
my_vec - dplyr::lag(my_vec, n = 1L)
 [1] NA  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
Code
# Daily deaths for NY, derived from the cumulative `count`, newest first.
covus |>
  filter(measure == "death", state %in% "NY") |>
  select(-data_quality_grade) |>
  group_by(state) |>
  # order_by = date makes lag() follow date order regardless of row order
  mutate(deaths_daily = count - lag(count, order_by = date)) |>
  arrange(desc(date))
# A tibble: 371 × 7
# Groups:   state [1]
   date       state fips  measure count measure_label deaths_daily
   <date>     <chr> <chr> <chr>   <dbl> <chr>                <dbl>
 1 2021-03-07 NY    36    death   39029 Deaths                  59
 2 2021-03-06 NY    36    death   38970 Deaths                  79
 3 2021-03-05 NY    36    death   38891 Deaths                  95
 4 2021-03-04 NY    36    death   38796 Deaths                  61
 5 2021-03-03 NY    36    death   38735 Deaths                  75
 6 2021-03-02 NY    36    death   38660 Deaths                  83
 7 2021-03-01 NY    36    death   38577 Deaths                  80
 8 2021-02-28 NY    36    death   38497 Deaths                  90
 9 2021-02-27 NY    36    death   38407 Deaths                  86
10 2021-02-26 NY    36    death   38321 Deaths                  94
# ℹ 361 more rows

Writing your own basic function

Code
# A minimal function: takes `x` and returns `x + 1`.
# The body is left exactly as written because printing the object echoes its
# source text.
my_fun <- function(x) {
  x + 1
}

my_fun # we've created the function; it's just an object
function(x) {
  x + 1
}
Code
# Call with the argument named explicitly: 1 + 1.
my_fun(x = 1) # But we can supply it with an input!
[1] 2
Code
# Same function, argument matched by position: 10 + 1.
my_fun(10)
[1] 11
Code
# Convert a running total into per-period changes.
#
# count: numeric vector of cumulative values.
# date:  vector the same length as `count`; values are differenced in the
#        order given by `date`, not in their current row order.
# Returns `count` minus the preceding value of `count` (in `date` order). The
# earliest observation has no predecessor, so its result is NA.
get_daily_count <- function(count, date){
  # Namespace-qualified so the helper does not depend on dplyr being attached
  # ahead of stats: stats::lag() has no `order_by` argument.
  count - dplyr::lag(count, order_by = date)
}
Code
# Same daily-deaths table as before, now computed with get_daily_count().
covus |>
  select(-data_quality_grade) |>
  filter(measure == "death", state %in% "NY") |>
  group_by(state) |>
  mutate(deaths_daily = get_daily_count(count = count, date = date)) |>
  arrange(desc(date))
# A tibble: 371 × 7
# Groups:   state [1]
   date       state fips  measure count measure_label deaths_daily
   <date>     <chr> <chr> <chr>   <dbl> <chr>                <dbl>
 1 2021-03-07 NY    36    death   39029 Deaths                  59
 2 2021-03-06 NY    36    death   38970 Deaths                  79
 3 2021-03-05 NY    36    death   38891 Deaths                  95
 4 2021-03-04 NY    36    death   38796 Deaths                  61
 5 2021-03-03 NY    36    death   38735 Deaths                  75
 6 2021-03-02 NY    36    death   38660 Deaths                  83
 7 2021-03-01 NY    36    death   38577 Deaths                  80
 8 2021-02-28 NY    36    death   38497 Deaths                  90
 9 2021-02-27 NY    36    death   38407 Deaths                  86
10 2021-02-26 NY    36    death   38321 Deaths                  94
# ℹ 361 more rows

Moving averages

Code
# install.packages("slider")
library(slider)
Code
# Daily deaths and their trailing 7-day mean for each state, shown for NY.
covus |>
  filter(measure == "death") |>
  select(-data_quality_grade) |> 
  group_by(state) |>
  arrange(date) |> 
  mutate(
    deaths_daily = get_daily_count(count, date), 
    # A slider window is the current row plus `before` earlier rows, so a
    # 7-day mean needs before = 6 (before = 7 would average 8 days).
    deaths7 = slide_mean(deaths_daily, 
                         before = 6, 
                         na_rm = TRUE)) |> 
  arrange(state, desc(date)) |> 
  filter(state %in% "NY")
# A tibble: 371 × 8
# Groups:   state [1]
   date       state fips  measure count measure_label deaths_daily deaths7
   <date>     <chr> <chr> <chr>   <dbl> <chr>                <dbl>   <dbl>
 1 2021-03-07 NY    36    death   39029 Deaths                  59    76  
 2 2021-03-06 NY    36    death   38970 Deaths                  79    80.4
 3 2021-03-05 NY    36    death   38891 Deaths                  95    81.4
 4 2021-03-04 NY    36    death   38796 Deaths                  61    81.3
 5 2021-03-03 NY    36    death   38735 Deaths                  75    85.7
 6 2021-03-02 NY    36    death   38660 Deaths                  83    89.9
 7 2021-03-01 NY    36    death   38577 Deaths                  80    90.9
 8 2021-02-28 NY    36    death   38497 Deaths                  90    92.3
 9 2021-02-27 NY    36    death   38407 Deaths                  86    90.1
10 2021-02-26 NY    36    death   38321 Deaths                  94    92.3
# ℹ 361 more rows

Functions for tidying up columns

Code
# Print gss_sm (from socviz) to see which columns are available.
gss_sm
# A tibble: 2,867 × 32
    year    id ballot       age childs sibs   degree race  sex   region income16
   <dbl> <dbl> <labelled> <dbl>  <dbl> <labe> <fct>  <fct> <fct> <fct>  <fct>   
 1  2016     1 1             47      3 2      Bache… White Male  New E… $170000…
 2  2016     2 2             61      0 3      High … White Male  New E… $50000 …
 3  2016     3 3             72      2 3      Bache… White Male  New E… $75000 …
 4  2016     4 1             43      4 3      High … White Fema… New E… $170000…
 5  2016     5 3             55      2 2      Gradu… White Fema… New E… $170000…
 6  2016     6 2             53      2 2      Junio… White Fema… New E… $60000 …
 7  2016     7 1             50      2 2      High … White Male  New E… $170000…
 8  2016     8 3             23      3 6      High … Other Fema… Middl… $30000 …
 9  2016     9 1             45      3 5      High … Black Male  Middl… $60000 …
10  2016    10 3             71      4 1      Junio… White Male  Middl… $60000 …
# ℹ 2,857 more rows
# ℹ 21 more variables: relig <fct>, marital <fct>, padeg <fct>, madeg <fct>,
#   partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
#   zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
#   agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
#   bigregion <fct>, partners_rc <fct>, obama <dbl>

Data to practice on

Code
library(ukelection2019)

# Print the results tibble provided by the ukelection2019 package.
ukvote2019
# A tibble: 3,320 × 13
   cid     constituency electorate party_name candidate votes vote_share_percent
   <chr>   <chr>             <int> <chr>      <chr>     <int>              <dbl>
 1 W07000… Aberavon          50747 Labour     Stephen … 17008               53.8
 2 W07000… Aberavon          50747 Conservat… Charlott…  6518               20.6
 3 W07000… Aberavon          50747 The Brexi… Glenda D…  3108                9.8
 4 W07000… Aberavon          50747 Plaid Cym… Nigel Hu…  2711                8.6
 5 W07000… Aberavon          50747 Liberal D… Sheila K…  1072                3.4
 6 W07000… Aberavon          50747 Independe… Captain …   731                2.3
 7 W07000… Aberavon          50747 Green      Giorgia …   450                1.4
 8 W07000… Aberconwy         44699 Conservat… Robin Mi… 14687               46.1
 9 W07000… Aberconwy         44699 Labour     Emily Ow… 12653               39.7
10 W07000… Aberconwy         44699 Plaid Cym… Lisa Goo…  2704                8.5
# ℹ 3,310 more rows
# ℹ 6 more variables: vote_share_change <dbl>, total_votes_cast <int>,
#   vrank <int>, turnout <dbl>, fname <chr>, lname <chr>
Code
library(ukelection2019)

# Look at 10 randomly chosen rows (a different draw on every run).
# slice_sample() is the current replacement for the superseded sample_n().
ukvote2019 |>
  slice_sample(n = 10)
# A tibble: 10 × 13
   cid     constituency electorate party_name candidate votes vote_share_percent
   <chr>   <chr>             <int> <chr>      <chr>     <int>              <dbl>
 1 E14000… Poole             73992 Labour     Sue Aitk… 10483               20.8
 2 E14000… Mitcham & M…      70014 Green      Pippa Ma…  1160                2.5
 3 E14000… Folkestone …      88273 Conservat… Damian C… 35483               60.1
 4 E14000… Bristol Sou…      84079 Green      Tony Dyer  2713                4.9
 5 E14000… Great Yarmo…      71957 Liberal D… James Jo…  1661                3.8
 6 E14000… Rutland & M…      82711 Liberal D… Carol We…  7970               13.7
 7 E14000… St Austell …      79930 Labour     Felicity… 14747               26.4
 8 E14000… Bedfordshir…      90679 Liberal D… Daniel N…  7999               12.3
 9 N06000… South Down        79175 Alliance … Patrick …  6916               13.9
10 E14000… Bath              67725 Liberal D… Wera Hob… 28419               54.5
# ℹ 6 more variables: vote_share_change <dbl>, total_votes_cast <int>,
#   vrank <int>, turnout <dbl>, fname <chr>, lname <chr>

Using .by instead of group_by()

Code
# Row counts per region/religion pair via group_by() + summarize().
# The result stays grouped by bigregion, which is what summarize() leaves
# here by default.
gss_sm |>
  group_by(bigregion, religion) |>
  summarize(total = n(), .groups = "drop_last")
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion   total
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows
Code
# Same counts with tally(): it counts the rows in each group and names the
# result column `n`. The output is still grouped by bigregion.
gss_sm |> 
  group_by(bigregion, religion) |> 
  tally()
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows
Code
# Same counts again with count(), which does the grouping and tallying in one
# step and returns an ungrouped tibble.
gss_sm |> 
  count(bigregion, religion) 
# A tibble: 24 × 3
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows

Grouping with the .by argument, which applies only to the single operation it appears in, can be less confusing than a persistent group_by().

Code
# Per-operation grouping: `.by` groups only for this summarize() call.
# Note the rows come back in order of first appearance rather than sorted by
# the grouping columns.
gss_sm |> 
  summarize(total = n(), .by = c(bigregion, religion))
# A tibble: 24 × 3
   bigregion religion   total
   <fct>     <fct>      <int>
 1 Northeast None         112
 2 Northeast Catholic     162
 3 Northeast Protestant   158
 4 Northeast Other         28
 5 Northeast Jewish        27
 6 West      Jewish        10
 7 West      None         180
 8 West      Other         48
 9 West      Protestant   238
10 West      Catholic     155
# ℹ 14 more rows

When you use .by = in summarize(), you always get an ungrouped tibble back.

Code
# The result carries no grouping: there is no "# Groups:" line in the printout.
gss_sm |> 
  summarize(total = n(), .by = c(bigregion, religion))
# A tibble: 24 × 3
   bigregion religion   total
   <fct>     <fct>      <int>
 1 Northeast None         112
 2 Northeast Catholic     162
 3 Northeast Protestant   158
 4 Northeast Other         28
 5 Northeast Jewish        27
 6 West      Jewish        10
 7 West      None         180
 8 West      Other         48
 9 West      Protestant   238
10 West      Catholic     155
# ℹ 14 more rows

See help(dplyr::summarize) for more on these options, including the .groups argument. (You can’t use .groups and .by together.)

Comparisons on proportions

Code
# `df` here is a small tibble of proportions (its definition is not part of
# this section).
df
# A tibble: 4 × 3
  id    prop1 prop2
  <chr> <dbl> <dbl>
1 A      0.1   0.2 
2 B      0.1   0.21
3 C      0.11  0.2 
4 D      0.1   0.1 
Code
# Row A is returned even though 0.1 + 0.2 "should" equal 0.3 exactly: the
# floating-point sum comes out slightly greater than 0.3.
df |> 
  filter(prop1 + prop2 > 0.3)
# A tibble: 3 × 3
  id    prop1 prop2
  <chr> <dbl> <dbl>
1 A      0.1   0.2 
2 B      0.1   0.21
3 C      0.11  0.2 
Code
# Exact `==` on doubles: no row matches, not even A (0.1 + 0.2).
df |> 
  filter(prop1 + prop2 == 0.3)
# A tibble: 0 × 3
# ℹ 3 variables: id <chr>, prop1 <dbl>, prop2 <dbl>
Code
# Storing the sum in its own column first does not help: the stored value is
# the same inexact double, so `==` still finds nothing.
df |> 
  mutate(prop3 = prop1 + prop2) |> 
  filter(prop3 == 0.3)
# A tibble: 0 × 4
# ℹ 4 variables: id <chr>, prop1 <dbl>, prop2 <dbl>, prop3 <dbl>
Code
# Rescaling both sides by 100 makes row A compare equal here.
# NOTE(review): this works for these particular values; it is not a general
# fix for floating-point comparison.
df |> 
  filter(prop1*100 + prop2*100 == 0.3*100)
# A tibble: 1 × 3
  id    prop1 prop2
  <chr> <dbl> <dbl>
1 A       0.1   0.2
Code
# near() tests equality within a small tolerance, so row A is matched.
df |> 
  filter(near(prop1 + prop2, 0.3))
# A tibble: 1 × 3
  id    prop1 prop2
  <chr> <dbl> <dbl>
1 A       0.1   0.2

Zero counts in dplyr

Code
# Read the CSV, building the path with here(); this replaces the small `df`
# of proportions used in the previous section.
df <- read_csv(here("files", "data", "first_terms.csv"))
Rows: 280 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): party, sex
dbl  (1): pid
date (1): start_year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Print the freshly read data: one row per `pid`, with start date, party, sex.
df
# A tibble: 280 × 4
     pid start_year party      sex  
   <dbl> <date>     <chr>      <chr>
 1  3160 2013-01-03 Republican M    
 2  3161 2013-01-03 Democrat   F    
 3  3162 2013-01-03 Democrat   M    
 4  3163 2013-01-03 Republican M    
 5  3164 2013-01-03 Democrat   M    
 6  3165 2013-01-03 Republican M    
 7  3166 2013-01-03 Republican M    
 8  3167 2013-01-03 Democrat   F    
 9  3168 2013-01-03 Republican M    
10  3169 2013-01-03 Democrat   M    
# ℹ 270 more rows
Code
# Count rows per start_year/party/sex, then express each count as a share of
# its start_year/party total.
df |>
    count(start_year, party, sex, name = "N") |>
    group_by(start_year, party) |>
    mutate(freq = N / sum(N))
# A tibble: 14 × 5
# Groups:   start_year, party [8]
   start_year party      sex       N   freq
   <date>     <chr>      <chr> <int>  <dbl>
 1 2013-01-03 Democrat   F        21 0.362 
 2 2013-01-03 Democrat   M        37 0.638 
 3 2013-01-03 Republican F         8 0.101 
 4 2013-01-03 Republican M        71 0.899 
 5 2015-01-03 Democrat   M         1 1     
 6 2015-01-03 Republican M         5 1     
 7 2017-01-03 Democrat   F         6 0.24  
 8 2017-01-03 Democrat   M        19 0.76  
 9 2017-01-03 Republican F         2 0.0667
10 2017-01-03 Republican M        28 0.933 
11 2019-01-03 Democrat   F        33 0.647 
12 2019-01-03 Democrat   M        18 0.353 
13 2019-01-03 Republican F         1 0.0323
14 2019-01-03 Republican M        30 0.968 
Code
# Column chart of `freq` (share of each sex within a start_year/party group),
# filled by sex, with one panel per party.
p_col <- df |>
    count(start_year, party, sex, name = "N") |>
    group_by(start_year, party) |>
    mutate(freq = N / sum(N)) |>
    ggplot(aes(x = start_year, y = freq, fill = sex)) +
    geom_col() +
    facet_wrap(~ party) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(values = sex_colors, labels = c("Women", "Men")) +
    labs(x = "Year", y = "Percent", fill = "Group")
Code
# Printing the ggplot object draws the plot.
p_col

Code
# Line chart of `freq` (share of each sex within a start_year/party group)
# over start_year, colored by sex, with one panel per party.
p_line <- df |>
    group_by(start_year, party, sex) |>
    summarize(N = n()) |>
    mutate(freq = N / sum(N)) |>
    ggplot(aes(x = start_year,
               y = freq,
               color = sex)) +
    # `linewidth` sets line thickness; using `size` for lines was deprecated
    # in ggplot2 3.4.0 and triggers a warning.
    geom_line(linewidth = 1.1) +
    scale_y_continuous(labels = scales::percent) +
    scale_color_manual(values = sex_colors, labels = c("Women", "Men")) +
    guides(color = guide_legend(reverse = TRUE)) +
    labs(x = "Year", y = "Percent", color = "Group") +
    facet_wrap(~ party)
Code
# Printing the ggplot object draws the plot. Group combinations with no rows
# are absent from the summarized data, so those points are simply missing.
p_line

Code
# Add `party_f`, a factor version of `party` with levels taken from the data.
df_f <- mutate(df, party_f = factor(party))

df_f
# A tibble: 280 × 5
     pid start_year party      sex   party_f   
   <dbl> <date>     <chr>      <chr> <fct>     
 1  3160 2013-01-03 Republican M     Republican
 2  3161 2013-01-03 Democrat   F     Democrat  
 3  3162 2013-01-03 Democrat   M     Democrat  
 4  3163 2013-01-03 Republican M     Republican
 5  3164 2013-01-03 Democrat   M     Democrat  
 6  3165 2013-01-03 Republican M     Republican
 7  3166 2013-01-03 Republican M     Republican
 8  3167 2013-01-03 Democrat   F     Democrat  
 9  3168 2013-01-03 Republican M     Republican
10  3169 2013-01-03 Democrat   M     Democrat  
# ℹ 270 more rows
Code
# Number of rows for each level of party_f.
df_f |>
  count(party_f)
# A tibble: 2 × 2
  party_f        n
  <fct>      <int>
1 Democrat     135
2 Republican   145
Code
# A factor is stored as integer codes, hence "integer" rather than "character".
typeof(df_f$party_f)
[1] "integer"
Code
# The labels that the integer codes map to.
levels(df_f$party_f)
[1] "Democrat"   "Republican"
Code
# Rebuild party_f with an explicit level set that includes "Libertarian",
# a level that has no rows in the data, then count rows per level.
df_f <- mutate(
  df,
  party_f = factor(party, levels = c("Democrat", "Republican", "Libertarian"))
)
df_f |>
  count(party_f)
# A tibble: 2 × 2
  party_f        n
  <fct>      <int>
1 Democrat     135
2 Republican   145
Code
# The unused "Libertarian" level is still part of the factor, even though the
# tally above shows no row for it.
levels(df_f$party_f)
[1] "Democrat"    "Republican"  "Libertarian"
Code
# Same frequency table with the character columns converted to factors.
# as_factor() orders levels by first appearance, which is why the rows now
# start with Republican / M.
df |>
  mutate(across(where(is.character), \(col) as_factor(col))) |>
  group_by(start_year, party, sex) |>
  summarize(N = n(), .groups = "drop_last") |>
  mutate(freq = N / sum(N))
# A tibble: 14 × 5
# Groups:   start_year, party [8]
   start_year party      sex       N   freq
   <date>     <fct>      <fct> <int>  <dbl>
 1 2013-01-03 Republican M        71 0.899 
 2 2013-01-03 Republican F         8 0.101 
 3 2013-01-03 Democrat   M        37 0.638 
 4 2013-01-03 Democrat   F        21 0.362 
 5 2015-01-03 Republican M         5 1     
 6 2015-01-03 Democrat   M         1 1     
 7 2017-01-03 Republican M        28 0.933 
 8 2017-01-03 Republican F         2 0.0667
 9 2017-01-03 Democrat   M        19 0.76  
10 2017-01-03 Democrat   F         6 0.24  
11 2019-01-03 Republican M        30 0.968 
12 2019-01-03 Republican F         1 0.0323
13 2019-01-03 Democrat   M        18 0.353 
14 2019-01-03 Democrat   F        33 0.647 
Code
# With factor columns, `.drop = FALSE` keeps combinations of factor levels
# that have no rows, so they show up with N = 0.
df |>
  mutate(across(where(is.character), \(col) as_factor(col))) |>
  group_by(start_year, party, sex, .drop = FALSE) |>
  summarize(N = n(), .groups = "drop_last") |>
  mutate(freq = N / sum(N))
# A tibble: 16 × 5
# Groups:   start_year, party [8]
   start_year party      sex       N   freq
   <date>     <fct>      <fct> <int>  <dbl>
 1 2013-01-03 Republican M        71 0.899 
 2 2013-01-03 Republican F         8 0.101 
 3 2013-01-03 Democrat   M        37 0.638 
 4 2013-01-03 Democrat   F        21 0.362 
 5 2015-01-03 Republican M         5 1     
 6 2015-01-03 Republican F         0 0     
 7 2015-01-03 Democrat   M         1 1     
 8 2015-01-03 Democrat   F         0 0     
 9 2017-01-03 Republican M        28 0.933 
10 2017-01-03 Republican F         2 0.0667
11 2017-01-03 Democrat   M        19 0.76  
12 2017-01-03 Democrat   F         6 0.24  
13 2019-01-03 Republican M        30 0.968 
14 2019-01-03 Republican F         1 0.0323
15 2019-01-03 Democrat   M        18 0.353 
16 2019-01-03 Democrat   F        33 0.647 
Code
# Frequency table with every start_year/party/sex combination present:
# complete() adds the missing combinations and fills their N and freq with 0.
df_c <- df |>
    count(start_year, party, sex, name = "N") |>
    mutate(freq = N / sum(N), .by = c(start_year, party)) |>
    complete(start_year, party, sex,
             fill = list(N = 0, freq = 0))
Code
# 16 rows now: the 2015 combinations with no women appear with N = 0.
df_c
# A tibble: 16 × 5
   start_year party      sex       N   freq
   <date>     <chr>      <chr> <int>  <dbl>
 1 2013-01-03 Democrat   F        21 0.362 
 2 2013-01-03 Democrat   M        37 0.638 
 3 2013-01-03 Republican F         8 0.101 
 4 2013-01-03 Republican M        71 0.899 
 5 2015-01-03 Democrat   F         0 0     
 6 2015-01-03 Democrat   M         1 1     
 7 2015-01-03 Republican F         0 0     
 8 2015-01-03 Republican M         5 1     
 9 2017-01-03 Democrat   F         6 0.24  
10 2017-01-03 Democrat   M        19 0.76  
11 2017-01-03 Republican F         2 0.0667
12 2017-01-03 Republican M        28 0.933 
13 2019-01-03 Democrat   F        33 0.647 
14 2019-01-03 Democrat   M        18 0.353 
15 2019-01-03 Republican F         1 0.0323
16 2019-01-03 Republican M        30 0.968 
Code
# Line chart built from the completed table (df_c), so zero-count
# combinations are drawn as points at 0 instead of being absent.
p_out <- df_c |>
  ggplot(aes(x = start_year,
               y = freq,
               color = sex)) +
    # `linewidth` sets line thickness; using `size` for lines was deprecated
    # in ggplot2 3.4.0.
    geom_line(linewidth = 1.1) +
    scale_y_continuous(labels = scales::percent) +
    scale_color_manual(values = sex_colors, labels = c("Women", "Men")) +
    guides(color = guide_legend(reverse = TRUE)) +
    labs(x = "Year", y = "Percent", color = "Group") +
    facet_wrap(~ party)
Code
# Printing the ggplot object draws the plot.
p_out