── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Crosstabs
Code
## library(socviz) # if not loaded

# Print the gss_sm tibble (presumably supplied by socviz -- confirm).
gss_sm
# A tibble: 2,867 × 32
year id ballot age childs sibs degree race sex region income16
<dbl> <dbl> <labelled> <dbl> <dbl> <labe> <fct> <fct> <fct> <fct> <fct>
1 2016 1 1 47 3 2 Bache… White Male New E… $170000…
2 2016 2 2 61 0 3 High … White Male New E… $50000 …
3 2016 3 3 72 2 3 Bache… White Male New E… $75000 …
4 2016 4 1 43 4 3 High … White Fema… New E… $170000…
5 2016 5 3 55 2 2 Gradu… White Fema… New E… $170000…
6 2016 6 2 53 2 2 Junio… White Fema… New E… $60000 …
7 2016 7 1 50 2 2 High … White Male New E… $170000…
8 2016 8 3 23 3 6 High … Other Fema… Middl… $30000 …
9 2016 9 1 45 3 5 High … Black Male Middl… $60000 …
10 2016 10 3 71 4 1 Junio… White Male Middl… $60000 …
# ℹ 2,857 more rows
# ℹ 21 more variables: relig <fct>, marital <fct>, padeg <fct>, madeg <fct>,
# partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
# zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
# agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
# bigregion <fct>, partners_rc <fct>, obama <dbl>
# A tibble: 238 × 8
country year world opt consent_law consent_practice consistent ccode
<chr> <date> <chr> <chr> <chr> <chr> <chr> <chr>
1 Austral… NA Libe… In Informed Informed Yes Oz
2 Austral… 1991-01-01 Libe… In Informed Informed Yes Oz
3 Austral… 1992-01-01 Libe… In Informed Informed Yes Oz
4 Austral… 1993-01-01 Libe… In Informed Informed Yes Oz
5 Austral… 1994-01-01 Libe… In Informed Informed Yes Oz
6 Austral… 1995-01-01 Libe… In Informed Informed Yes Oz
7 Austral… 1996-01-01 Libe… In Informed Informed Yes Oz
8 Austral… 1997-01-01 Libe… In Informed Informed Yes Oz
9 Austral… 1998-01-01 Libe… In Informed Informed Yes Oz
10 Austral… 1999-01-01 Libe… In Informed Informed Yes Oz
# ℹ 228 more rows
# A tibble: 238 × 4
country year gdp gdp_lag
<chr> <date> <int> <int>
1 Australia NA 16774 16591
2 Australia 1991-01-01 17171 16774
3 Australia 1992-01-01 17914 17171
4 Australia 1993-01-01 18883 17914
5 Australia 1994-01-01 19849 18883
6 Australia 1995-01-01 21079 19849
7 Australia 1996-01-01 21923 21079
8 Australia 1997-01-01 22961 21923
9 Australia 1998-01-01 24148 22961
10 Australia 1999-01-01 25445 24148
# ℹ 228 more rows
Code
## Keep only the rows for the two countries of interest.
organdata |>
  filter(country %in% c("Australia", "Canada"))
# A tibble: 28 × 21
country year donors pop pop_dens gdp gdp_lag health health_lag
<chr> <date> <dbl> <int> <dbl> <int> <int> <dbl> <dbl>
1 Australia NA NA 17065 0.220 16774 16591 1300 1224
2 Australia 1991-01-01 12.1 17284 0.223 17171 16774 1379 1300
3 Australia 1992-01-01 12.4 17495 0.226 17914 17171 1455 1379
4 Australia 1993-01-01 12.5 17667 0.228 18883 17914 1540 1455
5 Australia 1994-01-01 10.2 17855 0.231 19849 18883 1626 1540
6 Australia 1995-01-01 10.2 18072 0.233 21079 19849 1737 1626
7 Australia 1996-01-01 10.6 18311 0.237 21923 21079 1846 1737
8 Australia 1997-01-01 10.3 18518 0.239 22961 21923 1948 1846
9 Australia 1998-01-01 10.5 18711 0.242 24148 22961 2077 1948
10 Australia 1999-01-01 8.67 18926 0.244 25445 24148 2231 2077
# ℹ 18 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
# assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
# consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
# A tibble: 34 × 6
# Groups: race, sex [6]
race sex degree n mean_age mean_kids
<fct> <fct> <fct> <int> <dbl> <dbl>
1 White Male Lt High School 96 52.9 2.45
2 White Male High School 470 48.8 1.61
3 White Male Junior College 65 47.1 1.54
4 White Male Bachelor 208 48.6 1.35
5 White Male Graduate 112 56.0 1.71
6 White Female Lt High School 101 55.4 2.81
7 White Female High School 587 51.9 1.98
8 White Female Junior College 101 48.2 1.91
9 White Female Bachelor 218 49.2 1.44
10 White Female Graduate 138 53.6 1.38
# ℹ 24 more rows
Using across()
Starting to get repetitive. This is a warning sign:
# Columns to summarize, held in an external character vector.
my_vars <- c("gdp", "donors", "roads")

## nested parens again, but it's worth it
organdata |>
  group_by(consent_law, country) |>
  ## Tidyselect requires all_of() to
  ## make the selection explicit
  summarize(across(
    all_of(my_vars),
    ## Pass na.rm inside an anonymous function: the `...` argument of
    ## across() is deprecated as of dplyr 1.1.0.
    list(avg = \(x) mean(x, na.rm = TRUE))
  ))
Warning: There was 1 warning in `summarize()`.
ℹ In argument: `across(all_of(my_vars), list(avg = mean), na.rm = TRUE)`.
ℹ In group 1: `consent_law = "Informed"` and `country = "Australia"`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
Warning: There was 1 warning in `summarize()`.
ℹ In argument: `across(my_vars, list(avg = mean, sd = var, md = median), na.rm
= TRUE)`.
ℹ In group 1: `consent_law = "Informed"` and `country = "Australia"`.
Caused by warning:
! Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(my_vars)
# Now:
data %>% select(all_of(my_vars))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Mean, variance and median of every numeric column, per
## consent_law x country group.
organdata |>
  group_by(consent_law, country) |>
  summarize(across(
    where(is.numeric),
    ## Each summary drops NAs via its own anonymous function; passing
    ## na.rm through across()'s `...` is deprecated as of dplyr 1.1.0.
    list(
      mean = \(x) mean(x, na.rm = TRUE),
      var = \(x) var(x, na.rm = TRUE),
      median = \(x) median(x, na.rm = TRUE)
    )
  )) |>
  print(n = 3) # just to save space here
# A tibble: 238 × 7
country world opt consent_law consent_practice consistent ccode
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
2 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
3 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
4 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
5 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
6 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
7 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
8 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
9 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
10 AUSTRALIA LIBERAL IN INFORMED INFORMED YES OZ
# ℹ 228 more rows
# A tibble: 17 × 3
# Groups: consent_law [2]
consent_law country donors
<chr> <chr> <dbl>
1 Presumed Spain 28.1
2 Presumed Austria 23.5
3 Presumed Belgium 21.9
4 Informed United States 20.0
5 Informed Ireland 19.8
# ℹ 12 more rows
slice_max() et al.
Code
## Average donation rate per country, then the five highest-ranked
## countries within each consent_law group.
organdata |>
  group_by(consent_law, country) |>
  ## summarize() peels off the innermost grouping (country), so the
  ## result is still grouped by consent_law when slice_max() runs.
  summarize(donors = mean(donors, na.rm = TRUE)) |>
  slice_max(donors, n = 5)
# A tibble: 10 × 3
# Groups: consent_law [2]
consent_law country donors
<chr> <chr> <dbl>
1 Informed United States 20.0
2 Informed Ireland 19.8
3 Informed Canada 14.0
4 Informed Netherlands 13.7
5 Informed United Kingdom 13.5
6 Presumed Spain 28.1
7 Presumed Austria 23.5
8 Presumed Belgium 21.9
9 Presumed Finland 18.4
10 Presumed France 16.8
Window functions
Code
## Data on COVID-19
library(covdata)
Attaching package: 'covdata'
The following object is masked _by_ '.GlobalEnv':
%nin%
The following object is masked from 'package:socviz':
%nin%
The following object is masked from 'package:datasets':
uspop
Code
# Print the covnat_weekly table (presumably provided by covdata -- confirm).
covnat_weekly
# A tibble: 4,966 × 11
date year_week cname iso3 pop cases deaths cu_cases cu_deaths
<date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2019-12-30 2020-01 Austria AUT 8932664 NA NA NA NA
2 2020-01-06 2020-02 Austria AUT 8932664 NA NA NA NA
3 2020-01-13 2020-03 Austria AUT 8932664 NA NA NA NA
4 2020-01-20 2020-04 Austria AUT 8932664 NA NA NA NA
5 2020-01-27 2020-05 Austria AUT 8932664 NA NA NA NA
6 2020-02-03 2020-06 Austria AUT 8932664 NA NA NA NA
7 2020-02-10 2020-07 Austria AUT 8932664 NA NA NA NA
8 2020-02-17 2020-08 Austria AUT 8932664 NA NA NA NA
9 2020-02-24 2020-09 Austria AUT 8932664 12 0 12 0
10 2020-03-02 2020-10 Austria AUT 8932664 115 0 127 0
# ℹ 4,956 more rows
# ℹ 2 more variables: r14_cases <dbl>, r14_deaths <dbl>
Code
## Running total of reported cases for France.
covnat_weekly |>
  filter(iso3 == "FRA") |>
  select(date, cname, iso3, cases) |>
  mutate(
    cases = coalesce(cases, 0), # convert NA vals in `cases` to 0
    ## mutate() evaluates its arguments in order, so cumsum() sees the
    ## NA-free `cases` created on the line above.
    cumulative = cumsum(cases)
  )
# A tibble: 159 × 5
date cname iso3 cases cumulative
<date> <chr> <chr> <dbl> <dbl>
1 2019-12-30 France FRA 0 0
2 2020-01-06 France FRA 0 0
3 2020-01-13 France FRA 0 0
4 2020-01-20 France FRA 3 3
5 2020-01-27 France FRA 3 6
6 2020-02-03 France FRA 6 12
7 2020-02-10 France FRA 0 12
8 2020-02-17 France FRA 4 16
9 2020-02-24 France FRA 133 149
10 2020-03-02 France FRA 981 1130
# ℹ 149 more rows
Code
## Narrow to France first, then keep the display columns. The ranking
## filter stays a separate step so cume_dist() is computed over the
## French rows only rather than over every country.
covnat_weekly |>
  filter(iso3 == "FRA") |>
  select(date, cname, iso3, deaths) |>
  filter(cume_dist(desc(deaths)) < 0.1) # i.e. Top 10%
# A tibble: 15 × 4
date cname iso3 deaths
<date> <chr> <chr> <dbl>
1 2020-04-06 France FRA 3348
2 2020-10-26 France FRA 3517
3 2020-11-02 France FRA 5281
4 2020-11-09 France FRA 6018
5 2020-11-16 France FRA 6208
6 2020-11-23 France FRA 5215
7 2020-11-30 France FRA 4450
8 2020-12-07 France FRA 4257
9 2020-12-14 France FRA 3786
10 2020-12-21 France FRA 3560
11 2021-01-04 France FRA 3851
12 2021-01-11 France FRA 3833
13 2021-01-18 France FRA 3754
14 2021-01-25 France FRA 3535
15 2021-02-01 France FRA 3431
# A tibble: 371 × 7
# Groups: state [1]
date state fips data_quality_grade measure count measure_label
<date> <chr> <chr> <lgl> <chr> <dbl> <chr>
1 2021-03-07 NY 36 NA death 39029 Deaths
2 2021-03-06 NY 36 NA death 38970 Deaths
3 2021-03-05 NY 36 NA death 38891 Deaths
4 2021-03-04 NY 36 NA death 38796 Deaths
5 2021-03-03 NY 36 NA death 38735 Deaths
6 2021-03-02 NY 36 NA death 38660 Deaths
7 2021-03-01 NY 36 NA death 38577 Deaths
8 2021-02-28 NY 36 NA death 38497 Deaths
9 2021-02-27 NY 36 NA death 38407 Deaths
10 2021-02-26 NY 36 NA death 38321 Deaths
# ℹ 361 more rows
# A tibble: 371 × 7
# Groups: state [1]
date state fips measure count measure_label deaths_daily
<date> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2021-03-07 NY 36 death 39029 Deaths 59
2 2021-03-06 NY 36 death 38970 Deaths 79
3 2021-03-05 NY 36 death 38891 Deaths 95
4 2021-03-04 NY 36 death 38796 Deaths 61
5 2021-03-03 NY 36 death 38735 Deaths 75
6 2021-03-02 NY 36 death 38660 Deaths 83
7 2021-03-01 NY 36 death 38577 Deaths 80
8 2021-02-28 NY 36 death 38497 Deaths 90
9 2021-02-27 NY 36 death 38407 Deaths 86
10 2021-02-26 NY 36 death 38321 Deaths 94
# ℹ 361 more rows
Writing your own basic function
Code
#' Add one to the input
#'
#' @param x A value that supports `+ 1` (e.g. a numeric vector).
#' @return `x + 1`.
my_fun <- function(x) {
  x + 1
}

my_fun # we've created the function; it's just an object
function(x) {
x + 1
}
Code
# But we can supply it with an input!
my_fun(x = 1)
# A tibble: 371 × 7
# Groups: state [1]
date state fips measure count measure_label deaths_daily
<date> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2021-03-07 NY 36 death 39029 Deaths 59
2 2021-03-06 NY 36 death 38970 Deaths 79
3 2021-03-05 NY 36 death 38891 Deaths 95
4 2021-03-04 NY 36 death 38796 Deaths 61
5 2021-03-03 NY 36 death 38735 Deaths 75
6 2021-03-02 NY 36 death 38660 Deaths 83
7 2021-03-01 NY 36 death 38577 Deaths 80
8 2021-02-28 NY 36 death 38497 Deaths 90
9 2021-02-27 NY 36 death 38407 Deaths 86
10 2021-02-26 NY 36 death 38321 Deaths 94
# ℹ 361 more rows
# A tibble: 371 × 8
# Groups: state [1]
date state fips measure count measure_label deaths_daily deaths7
<date> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
1 2021-03-07 NY 36 death 39029 Deaths 59 77.8
2 2021-03-06 NY 36 death 38970 Deaths 79 81.1
3 2021-03-05 NY 36 death 38891 Deaths 95 83
4 2021-03-04 NY 36 death 38796 Deaths 61 82.6
5 2021-03-03 NY 36 death 38735 Deaths 75 88
6 2021-03-02 NY 36 death 38660 Deaths 83 89.9
7 2021-03-01 NY 36 death 38577 Deaths 80 90.8
8 2021-02-28 NY 36 death 38497 Deaths 90 90.1
9 2021-02-27 NY 36 death 38407 Deaths 86 91.5
10 2021-02-26 NY 36 death 38321 Deaths 94 95.6
# ℹ 361 more rows
Functions for tidying up columns
Code
# Print the gss_sm tibble to inspect its columns.
gss_sm
# A tibble: 2,867 × 32
year id ballot age childs sibs degree race sex region income16
<dbl> <dbl> <labelled> <dbl> <dbl> <labe> <fct> <fct> <fct> <fct> <fct>
1 2016 1 1 47 3 2 Bache… White Male New E… $170000…
2 2016 2 2 61 0 3 High … White Male New E… $50000 …
3 2016 3 3 72 2 3 Bache… White Male New E… $75000 …
4 2016 4 1 43 4 3 High … White Fema… New E… $170000…
5 2016 5 3 55 2 2 Gradu… White Fema… New E… $170000…
6 2016 6 2 53 2 2 Junio… White Fema… New E… $60000 …
7 2016 7 1 50 2 2 High … White Male New E… $170000…
8 2016 8 3 23 3 6 High … Other Fema… Middl… $30000 …
9 2016 9 1 45 3 5 High … Black Male Middl… $60000 …
10 2016 10 3 71 4 1 Junio… White Male Middl… $60000 …
# ℹ 2,857 more rows
# ℹ 21 more variables: relig <fct>, marital <fct>, padeg <fct>, madeg <fct>,
# partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
# zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
# agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
# bigregion <fct>, partners_rc <fct>, obama <dbl>
Rows: 280 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): party, sex
dbl (1): pid
date (1): start_year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Print df to inspect it (its columns here: pid, start_year, party, sex).
df
# A tibble: 280 × 4
pid start_year party sex
<dbl> <date> <chr> <chr>
1 3160 2013-01-03 Republican M
2 3161 2013-01-03 Democrat F
3 3162 2013-01-03 Democrat M
4 3163 2013-01-03 Republican M
5 3164 2013-01-03 Democrat M
6 3165 2013-01-03 Republican M
7 3166 2013-01-03 Republican M
8 3167 2013-01-03 Democrat F
9 3168 2013-01-03 Republican M
10 3169 2013-01-03 Democrat M
# ℹ 270 more rows
Code
## Count rows per start_year x party x sex, then turn the counts into
## proportions.
df |>
  group_by(start_year, party, sex) |>
  summarize(N = n()) |>
  ## summarize() drops the last grouping level (sex), so sum(N) is
  ## taken within each start_year x party group.
  mutate(freq = N / sum(N))
# A tibble: 14 × 5
# Groups: start_year, party [8]
start_year party sex N freq
<date> <chr> <chr> <int> <dbl>
1 2013-01-03 Democrat F 21 0.362
2 2013-01-03 Democrat M 37 0.638
3 2013-01-03 Republican F 8 0.101
4 2013-01-03 Republican M 71 0.899
5 2015-01-03 Democrat M 1 1
6 2015-01-03 Republican M 5 1
7 2017-01-03 Democrat F 6 0.24
8 2017-01-03 Democrat M 19 0.76
9 2017-01-03 Republican F 2 0.0667
10 2017-01-03 Republican M 28 0.933
11 2019-01-03 Democrat F 33 0.647
12 2019-01-03 Democrat M 18 0.353
13 2019-01-03 Republican F 1 0.0323
14 2019-01-03 Republican M 30 0.968