library(here) # manage file paths
library(socviz) # data and some useful functions
library(tidyverse) # your friend and mine
Duke University
February 14, 2024
Moar Data
Inside files/data/ is a folder named congress/, containing one CSV file for each Congress from the 79th to the 116th:
/Users/kjhealy/Documents/courses/socdata.co/files/data/congress
├── 01_79_congress.csv
├── 02_80_congress.csv
├── 03_81_congress.csv
├── 04_82_congress.csv
├── 05_83_congress.csv
├── 06_84_congress.csv
├── 07_85_congress.csv
├── 08_86_congress.csv
├── 09_87_congress.csv
├── 10_88_congress.csv
├── 11_89_congress.csv
├── 12_90_congress.csv
├── 13_91_congress.csv
├── 14_92_congress.csv
├── 15_93_congress.csv
├── 16_94_congress.csv
├── 17_95_congress.csv
├── 18_96_congress.csv
├── 19_97_congress.csv
├── 20_98_congress.csv
├── 21_99_congress.csv
├── 22_100_congress.csv
├── 23_101_congress.csv
├── 24_102_congress.csv
├── 25_103_congress.csv
├── 26_104_congress.csv
├── 27_105_congress.csv
├── 28_106_congress.csv
├── 29_107_congress.csv
├── 30_108_congress.csv
├── 31_109_congress.csv
├── 32_110_congress.csv
├── 33_111_congress.csv
├── 34_112_congress.csv
├── 35_113_congress.csv
├── 36_114_congress.csv
├── 37_115_congress.csv
└── 38_116_congress.csv
Let’s look at one.
read_csv(here("files", "data", "congress", "17_95_congress.csv")) |>
janitor::clean_names() |>
head()
# A tibble: 6 × 25
last first middle suffix nickname born death sex position party state
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Abdnor James <NA> <NA> <NA> 02/1… 11/0… M U.S. Re… Repu… SD
2 Abourezk James George <NA> <NA> 02/2… <NA> M U.S. Se… Demo… SD
3 Adams Brockm… <NA> <NA> Brock 01/1… 09/1… M U.S. Re… Demo… WA
4 Addabbo Joseph Patri… <NA> <NA> 03/1… 04/1… M U.S. Re… Demo… NY
5 Aiken George David <NA> <NA> 08/2… 11/1… M U.S. Se… Repu… VT
6 Akaka Daniel Kahik… <NA> <NA> 09/1… 04/0… M U.S. Re… Demo… HI
# ℹ 14 more variables: district <chr>, start <chr>, end <chr>, religion <chr>,
# race <chr>, educational_attainment <chr>, job_type1 <chr>, job_type2 <chr>,
# job_type3 <chr>, job_type4 <chr>, job_type5 <lgl>, mil1 <chr>, mil2 <chr>,
# mil3 <chr>
We often find ourselves in this situation. We know each file has the same structure, and we would like to use them all at once.
How to read them all in?
One traditional way, which we could do in R, is to write an explicit loop that iterates over a vector of filenames, reads each file, and joins the results together into one tall rectangle.
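For example, a minimal sketch of the loop approach (assuming filenames is a vector of file paths; we build one properly below):

out <- vector(mode = "list", length = length(filenames))
for (i in seq_along(filenames)) {
  out[[i]] <- read_csv(filenames[i]) # read each file into the list
}
df <- bind_rows(out) # stack the list into one tall tibble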
Instead, we will use map() as a way of iterating without writing loops. You start with a vector of things. You feed it one thing at a time to some function. The function does whatever it does. You get back output that is the same length as your input, and of a specific type.

The purrr package provides a big family of mapping functions. One reason there are a lot of them is that purrr, like the rest of the tidyverse, is picky about data types. In addition to map(), which always returns a list, we also have map_chr(), map_int(), map_dbl(), map_lgl(), and others. They always return the data type indicated by their suffix, or die trying.

The simplest cases are not that different from the vectorized arithmetic we’re already familiar with.
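For example (an illustrative sketch; the values of a and b are arbitrary):

a <- c(1, 2, 3, 4, 5)
b <- 10
a + b

[1] 11 12 13 14 15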
R’s vectorized rules add b to every element of a. In a sense, the + operation can be thought of as a function that takes each element of a and does something with it. In this case, “add b”.
We can make this explicit by writing a function:
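A sketch of such a function (the name add_b is illustrative):

add_b <- function(x) {
  x + b # add b (from the enclosing environment) to each element of x
}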
Now:
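add_b(x = a)

[1] 11 12 13 14 15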
Again, R’s vectorized approach means it automatically adds b to every element of the x we give it.

Some operations can’t directly be vectorized in this way, which is why we need to iterate manually, or write loops.
library(gapminder)
gapminder |>
summarize(country_n = n_distinct(country),
continent_n = n_distinct(continent),
year_n = n_distinct(year),
lifeExp_n = n_distinct(lifeExp),
pop_n = n_distinct(pop))
# A tibble: 1 × 5
  country_n continent_n year_n lifeExp_n pop_n
      <int>       <int>  <int>     <int> <int>
1       142           5     12      1626  1704
That’s tedious to write! Computers are supposed to allow us to avoid that sort of thing.
So how would we iterate this? What we want is to apply the n_distinct() function to each column of gapminder, but in a way that still allows us to use pipelines and so on.
library(gapminder)
gapminder |>
summarize(n_distinct(country),
n_distinct(continent),
n_distinct(year),
n_distinct(lifeExp),
n_distinct(pop))
# A tibble: 1 × 5
`n_distinct(country)` `n_distinct(continent)` `n_distinct(year)`
<int> <int> <int>
1 142 5 12
# ℹ 2 more variables: `n_distinct(lifeExp)` <int>,
#   `n_distinct(pop)` <int>
You’d use across(), like this:
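gapminder |>
  summarize(across(everything(), n_distinct)) # apply n_distinct() to every column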
But you could also do this …
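map(gapminder, n_distinct) # map the function over each column; returns a list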
Or, in pipeline form:
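gapminder |>
  map(n_distinct)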
But we know n_distinct() should always return an integer. So we use map_int() instead of the generic map().
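gapminder |>
  map_int(n_distinct)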
country continent year lifeExp pop gdpPercap
142 5 12 1626 1704 1704
The thing about the map() family is that its functions can deal with all kinds of input and output types.
filenames <- dir(path = here("files", "data", "congress"),
pattern = "*.csv",
full.names = TRUE)
filenames[1:15] # Just displaying the first 15, to save slide space
[1] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/01_79_congress.csv"
[2] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/02_80_congress.csv"
[3] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/03_81_congress.csv"
[4] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/04_82_congress.csv"
[5] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/05_83_congress.csv"
[6] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/06_84_congress.csv"
[7] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/07_85_congress.csv"
[8] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/08_86_congress.csv"
[9] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/09_87_congress.csv"
[10] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/10_88_congress.csv"
[11] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/11_89_congress.csv"
[12] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/12_90_congress.csv"
[13] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/13_91_congress.csv"
[14] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/14_92_congress.csv"
[15] "/Users/kjhealy/Documents/courses/socdata.co/files/data/congress/15_93_congress.csv"
We read all the files in with read_csv(), using map(), and bind the resulting list into a tibble.
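Like this (the same pipeline is used again below to build df):

filenames |>
  map(read_csv) |>
  list_rbind(names_to = "congress") |>
  janitor::clean_names()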
# A tibble: 20,580 × 26
congress last first middle suffix nickname born death sex position party
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 Abern… Thom… Gerst… <NA> <NA> 05/1… 01/2… M U.S. Re… Demo…
2 1 Adams Sher… <NA> <NA> <NA> 01/0… 10/2… M U.S. Re… Repu…
3 1 Aiken Geor… David <NA> <NA> 08/2… 11/1… M U.S. Se… Repu…
4 1 Allen Asa Leona… <NA> <NA> 01/0… 01/0… M U.S. Re… Demo…
5 1 Allen Leo Elwood <NA> <NA> 10/0… 01/1… M U.S. Re… Repu…
6 1 Almond J. Linds… Jr. <NA> 06/1… 04/1… M U.S. Re… Demo…
7 1 Ander… Herm… Carl <NA> <NA> 01/2… 07/2… M U.S. Re… Repu…
8 1 Ander… Clin… Presba <NA> <NA> 10/2… 11/1… M U.S. Re… Demo…
9 1 Ander… John Zuing… <NA> <NA> 03/2… 02/0… M U.S. Re… Repu…
10 1 Andre… Augu… Herman <NA> <NA> 10/1… 01/1… M U.S. Re… Repu…
# ℹ 20,570 more rows
# ℹ 15 more variables: state <chr>, district <chr>, start <chr>, end <chr>,
# religion <chr>, race <chr>, educational_attainment <chr>, job_type1 <chr>,
# job_type2 <chr>, job_type3 <chr>, job_type4 <chr>, job_type5 <chr>,
# mil1 <chr>, mil2 <chr>, mil3 <chr>
read_csv()
can do this directlyIn fact map()
is not required for this particular use:
tmp <- read_csv(filenames, id = "path",
name_repair = janitor::make_clean_names)
tmp |>
mutate(congress = stringr::str_extract(path, "_\\d{2,3}_congress"),
congress = stringr::str_extract(congress, "\\d{2,3}")) |>
relocate(congress)
# A tibble: 20,580 × 27
congress path last first middle suffix nickname born death sex position
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 79 /User… Aber… Thom… Gerst… <NA> <NA> 05/1… 01/2… M U.S. Re…
2 79 /User… Adams Sher… <NA> <NA> <NA> 01/0… 10/2… M U.S. Re…
3 79 /User… Aiken Geor… David <NA> <NA> 08/2… 11/1… M U.S. Se…
4 79 /User… Allen Asa Leona… <NA> <NA> 01/0… 01/0… M U.S. Re…
5 79 /User… Allen Leo Elwood <NA> <NA> 10/0… 01/1… M U.S. Re…
6 79 /User… Almo… J. Linds… Jr. <NA> 06/1… 04/1… M U.S. Re…
7 79 /User… Ande… Herm… Carl <NA> <NA> 01/2… 07/2… M U.S. Re…
8 79 /User… Ande… Clin… Presba <NA> <NA> 10/2… 11/1… M U.S. Re…
9 79 /User… Ande… John Zuing… <NA> <NA> 03/2… 02/0… M U.S. Re…
10 79 /User… Andr… Augu… Herman <NA> <NA> 10/1… 01/1… M U.S. Re…
# ℹ 20,570 more rows
# ℹ 16 more variables: party <chr>, state <chr>, district <chr>, start <chr>,
# end <chr>, religion <chr>, race <chr>, educational_attainment <chr>,
# job_type1 <chr>, job_type2 <chr>, job_type3 <chr>, job_type4 <chr>,
# job_type5 <chr>, mil1 <chr>, mil2 <chr>, mil3 <chr>
Mapped iteration is very general, and not just for local files. For example: all counties in New York State, for a specific year.
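A sketch using the tidycensus package (and assuming a Census API key is installed):

library(tidycensus)

get_acs(geography = "county",
        variables = "B19013_001", # median household income
        state = "NY",
        survey = "acs1",
        year = 2005)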
# A tibble: 38 × 5
GEOID NAME variable estimate moe
<chr> <chr> <chr> <dbl> <dbl>
1 36001 Albany County, New York B19013_001 50054 2030
2 36005 Bronx County, New York B19013_001 29228 853
3 36007 Broome County, New York B19013_001 36394 2340
4 36009 Cattaraugus County, New York B19013_001 37580 2282
5 36011 Cayuga County, New York B19013_001 42057 2406
6 36013 Chautauqua County, New York B19013_001 35495 2077
7 36015 Chemung County, New York B19013_001 37418 3143
8 36019 Clinton County, New York B19013_001 44757 3500
9 36027 Dutchess County, New York B19013_001 61889 2431
10 36029 Erie County, New York B19013_001 41967 1231
# ℹ 28 more rows
What if we want the results for every available year? First, a handy function: set_names(). By default, set_names() will label a vector with that vector’s values:
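For example:

set_names(2005:2007)

2005 2006 2007 
2005 2006 2007 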
This works with map() just fine:
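A sketch of the step that produced this output:

2005:2019 |>
  map(\(x) get_acs(geography = "county",
                   variables = "B19013_001",
                   state = "NY",
                   survey = "acs1",
                   year = x)) |>
  list_rbind(names_to = "year")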
# A tibble: 580 × 6
year GEOID NAME variable estimate moe
<int> <chr> <chr> <chr> <dbl> <dbl>
1 1 36001 Albany County, New York B19013_001 50054 2030
2 1 36005 Bronx County, New York B19013_001 29228 853
3 1 36007 Broome County, New York B19013_001 36394 2340
4 1 36009 Cattaraugus County, New York B19013_001 37580 2282
5 1 36011 Cayuga County, New York B19013_001 42057 2406
6 1 36013 Chautauqua County, New York B19013_001 35495 2077
7 1 36015 Chemung County, New York B19013_001 37418 3143
8 1 36019 Clinton County, New York B19013_001 44757 3500
9 1 36027 Dutchess County, New York B19013_001 61889 2431
10 1 36029 Erie County, New York B19013_001 41967 1231
# ℹ 570 more rows
Our year id column tracks each result’s position in the list, but we’d like it to be the year itself. So, we use set_names():
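This is the core of the full pipeline shown below:

2005:2019 |>
  set_names() |>
  map(\(x) get_acs(geography = "county",
                   variables = "B19013_001",
                   state = "NY",
                   survey = "acs1",
                   year = x)) |>
  list_rbind(names_to = "year") |>
  mutate(year = as.integer(year))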
# A tibble: 580 × 6
year GEOID NAME variable estimate moe
<int> <chr> <chr> <chr> <dbl> <dbl>
1 2005 36001 Albany County, New York B19013_001 50054 2030
2 2005 36005 Bronx County, New York B19013_001 29228 853
3 2005 36007 Broome County, New York B19013_001 36394 2340
4 2005 36009 Cattaraugus County, New York B19013_001 37580 2282
5 2005 36011 Cayuga County, New York B19013_001 42057 2406
6 2005 36013 Chautauqua County, New York B19013_001 35495 2077
7 2005 36015 Chemung County, New York B19013_001 37418 3143
8 2005 36019 Clinton County, New York B19013_001 44757 3500
9 2005 36027 Dutchess County, New York B19013_001 61889 2431
10 2005 36029 Erie County, New York B19013_001 41967 1231
# ℹ 570 more rows
Now year is just the year. The year column is created as a character vector, so we convert it to an integer at the end.
p_out <- 2005:2019 |>
set_names() |>
map(\(x) get_acs(geography = "county",
variables = "B19013_001",
state = "NY",
survey = "acs1",
year = x)) |>
list_rbind(names_to = "year") |>
mutate(year = as.integer(year)) |>
ggplot(mapping = aes(x = year, y = estimate, group = year)) +
geom_boxplot(fill = "lightblue", alpha = 0.5, outlier.alpha = 0) +
geom_jitter(position = position_jitter(width = 0.1), shape = 1) +
scale_y_continuous(labels = scales::label_dollar()) +
labs(x = "Year", y = "Dollars",
title = "Median Household Income by County in New York State, 2005-2019",
subtitle = "ACS 1-year estimates", caption = "Data: U.S. Census Bureau.")
df <- filenames |>
map(read_csv) |>
list_rbind(names_to = "congress") |>
janitor::clean_names()
df |>
select(born, death, start, end)
# A tibble: 20,580 × 4
born death start end
<chr> <chr> <chr> <chr>
1 05/16/1903 01/23/1953 01/03/1945 01/03/1953
2 01/08/1899 10/27/1986 01/03/1945 01/03/1947
3 08/20/1892 11/19/1984 01/03/1945 01/03/1979
4 01/05/1891 01/05/1969 01/03/1945 01/03/1953
5 10/05/1898 01/19/1973 01/03/1945 01/02/1949
6 06/15/1898 04/14/1986 02/04/1946 04/17/1948
7 01/27/1897 07/26/1978 01/03/1945 01/03/1963
8 10/23/1895 11/11/1975 01/03/1941 06/30/1945
9 03/22/1904 02/09/1981 01/03/1945 01/03/1953
10 10/11/1890 01/14/1958 01/03/1945 01/14/1958
# ℹ 20,570 more rows
We’ll use the lubridate package to sort these out.
Lubridate has a wide range of functions to handle dates, times, and durations.
library(lubridate)
date_recodes <- c("born", "death", "start", "end")
df <- df |>
mutate(across(any_of(date_recodes), mdy),
congress = as.integer(congress) + 78)
df
# A tibble: 20,580 × 26
congress last first middle suffix nickname born death sex
<dbl> <chr> <chr> <chr> <chr> <chr> <date> <date> <chr>
1 79 Abernethy Thomas Gerst… <NA> <NA> 1903-05-16 1953-01-23 M
2 79 Adams Sherman <NA> <NA> <NA> 1899-01-08 1986-10-27 M
3 79 Aiken George David <NA> <NA> 1892-08-20 1984-11-19 M
4 79 Allen Asa Leona… <NA> <NA> 1891-01-05 1969-01-05 M
5 79 Allen Leo Elwood <NA> <NA> 1898-10-05 1973-01-19 M
6 79 Almond J. Linds… Jr. <NA> 1898-06-15 1986-04-14 M
7 79 Andersen Herman Carl <NA> <NA> 1897-01-27 1978-07-26 M
8 79 Anderson Clinton Presba <NA> <NA> 1895-10-23 1975-11-11 M
9 79 Anderson John Zuing… <NA> <NA> 1904-03-22 1981-02-09 M
10 79 Andresen August Herman <NA> <NA> 1890-10-11 1958-01-14 M
# ℹ 20,570 more rows
# ℹ 17 more variables: position <chr>, party <chr>, state <chr>,
# district <chr>, start <date>, end <date>, religion <chr>, race <chr>,
# educational_attainment <chr>, job_type1 <chr>, job_type2 <chr>,
# job_type3 <chr>, job_type4 <chr>, job_type5 <chr>, mil1 <chr>, mil2 <chr>,
# mil3 <chr>
sessions <- tibble(congress = 79:116,
start_year = seq(1945, 2019, by = 2),
end_year = seq(1947, 2021, by = 2)) |>
mutate(start_year = ymd(paste(start_year, "01", "03", sep = "-")),
end_year = ymd(paste(end_year, "01", "03", sep = "-")))
sessions
# A tibble: 38 × 3
congress start_year end_year
<int> <date> <date>
1 79 1945-01-03 1947-01-03
2 80 1947-01-03 1949-01-03
3 81 1949-01-03 1951-01-03
4 82 1951-01-03 1953-01-03
5 83 1953-01-03 1955-01-03
6 84 1955-01-03 1957-01-03
7 85 1957-01-03 1959-01-03
8 86 1959-01-03 1961-01-03
9 87 1961-01-03 1963-01-03
10 88 1963-01-03 1965-01-03
# ℹ 28 more rows
The big table:
# A tibble: 20,580 × 3
congress last born
<dbl> <chr> <date>
1 79 Abernethy 1903-05-16
2 79 Adams 1899-01-08
3 79 Aiken 1892-08-20
4 79 Allen 1891-01-05
5 79 Allen 1898-10-05
6 79 Almond 1898-06-15
7 79 Andersen 1897-01-27
8 79 Anderson 1895-10-23
9 79 Anderson 1904-03-22
10 79 Andresen 1890-10-11
# ℹ 20,570 more rows
The smaller table:
# A tibble: 38 × 3
congress start_year end_year
<int> <date> <date>
1 79 1945-01-03 1947-01-03
2 80 1947-01-03 1949-01-03
3 81 1949-01-03 1951-01-03
4 82 1951-01-03 1953-01-03
5 83 1953-01-03 1955-01-03
6 84 1955-01-03 1957-01-03
7 85 1957-01-03 1959-01-03
8 86 1959-01-03 1961-01-03
9 87 1961-01-03 1963-01-03
10 88 1963-01-03 1965-01-03
# ℹ 28 more rows
We will use left_join(), which is what you want most of the time when you are looking to merge a smaller table with additional information into a larger main one.
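A sketch of the join; relocate() is added here just to move the new columns up next to congress, matching the display below:

df |>
  left_join(sessions) |>
  relocate(start_year, end_year, .after = congress)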
Joining with `by = join_by(congress)`
# A tibble: 20,580 × 28
congress start_year end_year last first middle suffix nickname born
<dbl> <date> <date> <chr> <chr> <chr> <chr> <chr> <date>
1 79 1945-01-03 1947-01-03 Abern… Thom… Gerst… <NA> <NA> 1903-05-16
2 79 1945-01-03 1947-01-03 Adams Sher… <NA> <NA> <NA> 1899-01-08
3 79 1945-01-03 1947-01-03 Aiken Geor… David <NA> <NA> 1892-08-20
4 79 1945-01-03 1947-01-03 Allen Asa Leona… <NA> <NA> 1891-01-05
5 79 1945-01-03 1947-01-03 Allen Leo Elwood <NA> <NA> 1898-10-05
6 79 1945-01-03 1947-01-03 Almond J. Linds… Jr. <NA> 1898-06-15
7 79 1945-01-03 1947-01-03 Ander… Herm… Carl <NA> <NA> 1897-01-27
8 79 1945-01-03 1947-01-03 Ander… Clin… Presba <NA> <NA> 1895-10-23
9 79 1945-01-03 1947-01-03 Ander… John Zuing… <NA> <NA> 1904-03-22
10 79 1945-01-03 1947-01-03 Andre… Augu… Herman <NA> <NA> 1890-10-11
# ℹ 20,570 more rows
# ℹ 19 more variables: death <date>, sex <chr>, position <chr>, party <chr>,
# state <chr>, district <chr>, start <date>, end <date>, religion <chr>,
# race <chr>, educational_attainment <chr>, job_type1 <chr>, job_type2 <chr>,
# job_type3 <chr>, job_type4 <chr>, job_type5 <chr>, mil1 <chr>, mil2 <chr>,
# mil3 <chr>
Most of the time you will be looking to make a left_join().

Missing values: the result of almost any operation involving a missing/unknown value will be missing/unknown. That goes for tests with == as well, so never use == to check for missingness. Always use is.na() instead.
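For example:

NA == NA

[1] NA

is.na(NA)

[1] TRUE

Consider organdata, from the socviz package, which has plenty of missing values:

organdata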
# A tibble: 238 × 21
country year donors pop pop_dens gdp gdp_lag health health_lag
<chr> <date> <dbl> <int> <dbl> <int> <int> <dbl> <dbl>
1 Australia NA NA 17065 0.220 16774 16591 1300 1224
2 Australia 1991-01-01 12.1 17284 0.223 17171 16774 1379 1300
3 Australia 1992-01-01 12.4 17495 0.226 17914 17171 1455 1379
4 Australia 1993-01-01 12.5 17667 0.228 18883 17914 1540 1455
5 Australia 1994-01-01 10.2 17855 0.231 19849 18883 1626 1540
6 Australia 1995-01-01 10.2 18072 0.233 21079 19849 1737 1626
7 Australia 1996-01-01 10.6 18311 0.237 21923 21079 1846 1737
8 Australia 1997-01-01 10.3 18518 0.239 22961 21923 1948 1846
9 Australia 1998-01-01 10.5 18711 0.242 24148 22961 2077 1948
10 Australia 1999-01-01 8.67 18926 0.244 25445 24148 2231 2077
# ℹ 228 more rows
# ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
# assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
# consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
library(naniar) # for miss_var_summary()

organdata |>
  select(consent_law, year, pubhealth, roads) |>
  group_by(consent_law) |>
  miss_var_summary()
# A tibble: 6 × 4
# Groups: consent_law [2]
consent_law variable n_miss pct_miss
<chr> <chr> <int> <dbl>
1 Informed year 16 14.3
2 Informed pubhealth 8 7.14
3 Informed roads 8 7.14
4 Presumed year 18 14.3
5 Presumed pubhealth 13 10.3
6 Presumed roads 9 7.14
# An Excel file!
dat <- readxl::read_xlsx(here("files", "data", "symptoms.xlsx"))
dat |> print(n = nrow(dat))
# A tibble: 32 × 2
combination count
<chr> <dbl>
1 Anosmia 140
2 Cough 57
3 Fatigue 198
4 Diarrhea 12
5 Breath 5
6 Fever 11
7 Cough&Fatigue 179
8 Fatigue&Fever 28
9 Breath&Fatigue 10
10 Diarrhea&Fatigue 43
11 Anosmia&Fatigue 281
12 Breath&Cough 1
13 Anosmia&Diarrhea&Fatigue 64
14 Breath&Cough&Fatigue 22
15 Anosmia&Cough&Fatigue 259
16 Anosmia&Fever&Fatigue 46
17 Cough&Fever&Fatigue 54
18 Cough&Diarrhea 7
19 Cough&Diarrhea&Fatigue 31
20 Anosmia&Breath&Cough&Fatigue 26
21 Anosmia&Cough&Fatigue&Fever 69
22 Anosmia&Breath&Cough&Diarrhea&Fatigue 18
23 Anosmia&Breath&Cough&Fatigue&Fever 17
24 Breath&Cough&Fatigue&Fever 11
25 Breath&Cough&Diarrhea&Fatigue 7
26 Breath&Cough&Diarrhea&Fatigue&Fever 8
27 Diarrhea&Fatigue&Fever 12
28 Cough&Diarrhea&Fatigue&Fever 17
29 Anosmia&Diarrhea&Fatigue&Fever 17
30 Anosmia&Diarrhea&Cough&Fatigue 41
31 Anosmia&Breath&Cough&Diarrhea&Fatigue&Fever 23
32 Anosmia&Cough&Diarrhea&Fatigue&Fever 50
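We need an indicator for each symptom. One way to build it (a sketch; tab is an illustrative name, while symptoms reappears in the upset() call below) is to test each combination string for each symptom with str_detect():

symptoms <- c("Anosmia", "Cough", "Fatigue", "Diarrhea", "Breath", "Fever")

tab <- bind_cols(
  subset = dat$combination,
  map(set_names(symptoms), \(s) str_detect(dat$combination, s)) |> as_tibble(),
  count = dat$count
)
tab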
Now we have a table we can do something with.
# A tibble: 32 × 8
subset Anosmia Cough Fatigue Diarrhea Breath Fever count
<chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <dbl>
1 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE 140
2 Cough FALSE TRUE FALSE FALSE FALSE FALSE 57
3 Fatigue FALSE FALSE TRUE FALSE FALSE FALSE 198
4 Diarrhea FALSE FALSE FALSE TRUE FALSE FALSE 12
5 Breath FALSE FALSE FALSE FALSE TRUE FALSE 5
6 Fever FALSE FALSE FALSE FALSE FALSE TRUE 11
7 Cough&Fatigue FALSE TRUE TRUE FALSE FALSE FALSE 179
8 Fatigue&Fever FALSE FALSE TRUE FALSE FALSE TRUE 28
9 Breath&Fatigue FALSE FALSE TRUE FALSE TRUE FALSE 10
10 Diarrhea&Fatigue FALSE FALSE TRUE TRUE FALSE FALSE 43
11 Anosmia&Fatigue TRUE FALSE TRUE FALSE FALSE FALSE 281
12 Breath&Cough FALSE TRUE FALSE FALSE TRUE FALSE 1
13 Anosmia&Diarrhea&Fatigue TRUE FALSE TRUE TRUE FALSE FALSE 64
14 Breath&Cough&Fatigue FALSE TRUE TRUE FALSE TRUE FALSE 22
15 Anosmia&Cough&Fatigue TRUE TRUE TRUE FALSE FALSE FALSE 259
16 Anosmia&Fever&Fatigue TRUE FALSE TRUE FALSE FALSE TRUE 46
17 Cough&Fever&Fatigue FALSE TRUE TRUE FALSE FALSE TRUE 54
18 Cough&Diarrhea FALSE TRUE FALSE TRUE FALSE FALSE 7
19 Cough&Diarrhea&Fatigue FALSE TRUE TRUE TRUE FALSE FALSE 31
20 Anosmia&Breath&Cough&Fatig… TRUE TRUE TRUE FALSE TRUE FALSE 26
21 Anosmia&Cough&Fatigue&Fever TRUE TRUE TRUE FALSE FALSE TRUE 69
22 Anosmia&Breath&Cough&Diarr… TRUE TRUE TRUE TRUE TRUE FALSE 18
23 Anosmia&Breath&Cough&Fatig… TRUE TRUE TRUE FALSE TRUE TRUE 17
24 Breath&Cough&Fatigue&Fever FALSE TRUE TRUE FALSE TRUE TRUE 11
25 Breath&Cough&Diarrhea&Fati… FALSE TRUE TRUE TRUE TRUE FALSE 7
26 Breath&Cough&Diarrhea&Fati… FALSE TRUE TRUE TRUE TRUE TRUE 8
27 Diarrhea&Fatigue&Fever FALSE FALSE TRUE TRUE FALSE TRUE 12
28 Cough&Diarrhea&Fatigue&Fev… FALSE TRUE TRUE TRUE FALSE TRUE 17
29 Anosmia&Diarrhea&Fatigue&F… TRUE FALSE TRUE TRUE FALSE TRUE 17
30 Anosmia&Diarrhea&Cough&Fat… TRUE TRUE TRUE TRUE FALSE FALSE 41
31 Anosmia&Breath&Cough&Diarr… TRUE TRUE TRUE TRUE TRUE TRUE 23
32 Anosmia&Cough&Diarrhea&Fat… TRUE TRUE TRUE TRUE FALSE TRUE 50
Uncounting tables:
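With tidyr’s uncount(), each row is replicated according to its count column (which is dropped in the process):

indvs <- tab |>
  uncount(count)
indvs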
# A tibble: 1,764 × 7
subset Anosmia Cough Fatigue Diarrhea Breath Fever
<chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
2 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
3 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
4 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
5 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
6 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
7 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
8 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
9 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
10 Anosmia TRUE FALSE FALSE FALSE FALSE FALSE
# ℹ 1,754 more rows
Now we’ve reconstructed the individual-level observations.
# devtools::install_github("krassowski/complex-upset")
library(ComplexUpset)
upset(data = indvs, intersect = symptoms,
name="Symptom Groupings by Frequency. Total pool is 1,764 individuals.",
min_size = 0,
width_ratio = 0.125) +
labs(title = "Co-Occurrence of COVID-19 Symptoms",
caption = "Data: covid.joinzoe.com/us | Graph: @kjhealy")
We can’t do anything with this, programmatically.
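A sketch of the model behind this output (the object name out is illustrative; the formula is the one shown in the Call):

out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
          data = gapminder)
summary(out)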
Call:
lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
Residuals:
Min 1Q Median 3Q Max
-49.161 -4.486 0.297 5.110 25.175
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.781e+01 3.395e-01 140.819 < 2e-16 ***
gdpPercap 4.495e-04 2.346e-05 19.158 < 2e-16 ***
pop 6.570e-09 1.975e-09 3.326 0.000901 ***
continentAmericas 1.348e+01 6.000e-01 22.458 < 2e-16 ***
continentAsia 8.193e+00 5.712e-01 14.342 < 2e-16 ***
continentEurope 1.747e+01 6.246e-01 27.973 < 2e-16 ***
continentOceania 1.808e+01 1.782e+00 10.146 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.365 on 1697 degrees of freedom
Multiple R-squared: 0.5821, Adjusted R-squared: 0.5806
F-statistic: 393.9 on 6 and 1697 DF, p-value: < 2.2e-16
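The broom package can turn the model object into a tibble instead:

library(broom)
tidy(out)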
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 4.78e+1 0.340 141. 0
2 gdpPercap 4.50e-4 0.0000235 19.2 3.24e- 74
3 pop 6.57e-9 0.00000000198 3.33 9.01e- 4
4 continentAmericas 1.35e+1 0.600 22.5 5.19e- 98
5 continentAsia 8.19e+0 0.571 14.3 4.06e- 44
6 continentEurope 1.75e+1 0.625 28.0 6.34e-142
7 continentOceania 1.81e+1 1.78 10.1 1.59e- 23
That’s a lot nicer. Now it’s just a tibble. We know those.
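Ask for confidence intervals as well:

out_conf <- tidy(out, conf.int = TRUE)
out_conf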
# A tibble: 7 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 4.78e+1 3.40e-1 141. 0 4.71e+1 4.85e+1
2 gdpPercap 4.50e-4 2.35e-5 19.2 3.24e- 74 4.03e-4 4.96e-4
3 pop 6.57e-9 1.98e-9 3.33 9.01e- 4 2.70e-9 1.04e-8
4 continentAmericas 1.35e+1 6.00e-1 22.5 5.19e- 98 1.23e+1 1.47e+1
5 continentAsia 8.19e+0 5.71e-1 14.3 4.06e- 44 7.07e+0 9.31e+0
6 continentEurope 1.75e+1 6.25e-1 28.0 6.34e-142 1.62e+1 1.87e+1
7 continentOceania 1.81e+1 1.78e+0 10.1 1.59e- 23 1.46e+1 2.16e+1
out_conf |>
filter(term %nin% "(Intercept)") |>
mutate(nicelabs = prefix_strip(term, "continent")) |>
select(nicelabs, everything())
# A tibble: 6 × 8
nicelabs term estimate std.error statistic p.value conf.low conf.high
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 gdpPercap gdpPercap 4.50e-4 2.35e-5 19.2 3.24e- 74 4.03e-4 4.96e-4
2 Pop pop 6.57e-9 1.98e-9 3.33 9.01e- 4 2.70e-9 1.04e-8
3 Americas continent… 1.35e+1 6.00e-1 22.5 5.19e- 98 1.23e+1 1.47e+1
4 Asia continent… 8.19e+0 5.71e-1 14.3 4.06e- 44 7.07e+0 9.31e+0
5 Europe continent… 1.75e+1 6.25e-1 28.0 6.34e-142 1.62e+1 1.87e+1
6 Oceania continent… 1.81e+1 1.78e+0 10.1 1.59e- 23 1.46e+1 2.16e+1
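We can also fit a model to a single subset of the data. For example, Europe in 1977 (a sketch; the object names are illustrative, but eu77 matches the data name in the Call below):

eu77 <- gapminder |>
  filter(continent == "Europe", year == 1977)
fit <- lm(lifeExp ~ log(gdpPercap), data = eu77)
summary(fit)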
Call:
lm(formula = lifeExp ~ log(gdpPercap), data = eu77)
Residuals:
Min 1Q Median 3Q Max
-7.4956 -1.0306 0.0935 1.1755 3.7125
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 29.489 7.161 4.118 0.000306 ***
log(gdpPercap) 4.488 0.756 5.936 2.17e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.114 on 28 degrees of freedom
Multiple R-squared: 0.5572, Adjusted R-squared: 0.5414
F-statistic: 35.24 on 1 and 28 DF, p-value: 2.173e-06
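Fitting that model to every continent-year subset one at a time would be tedious. Instead, we nest the data (a sketch; the name out_le is illustrative):

out_le <- gapminder |>
  group_by(continent, year) |>
  nest()
out_le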
# A tibble: 60 × 3
# Groups: continent, year [60]
continent year data
<fct> <int> <list>
1 Asia 1952 <tibble [33 × 4]>
2 Asia 1957 <tibble [33 × 4]>
3 Asia 1962 <tibble [33 × 4]>
4 Asia 1967 <tibble [33 × 4]>
5 Asia 1972 <tibble [33 × 4]>
6 Asia 1977 <tibble [33 × 4]>
7 Asia 1982 <tibble [33 × 4]>
8 Asia 1987 <tibble [33 × 4]>
9 Asia 1992 <tibble [33 × 4]>
10 Asia 1997 <tibble [33 × 4]>
# ℹ 50 more rows
Think of nesting as a kind of “super-grouping”. Look in the object inspector.
It’s still in there.
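For example, filtering to one group and unnesting it:

out_le |>
  filter(continent == "Europe" & year == 1977) |>
  unnest(cols = c(data))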
# A tibble: 30 × 6
# Groups: continent, year [1]
continent year country lifeExp pop gdpPercap
<fct> <int> <fct> <dbl> <int> <dbl>
1 Europe 1977 Albania 68.9 2509048 3533.
2 Europe 1977 Austria 72.2 7568430 19749.
3 Europe 1977 Belgium 72.8 9821800 19118.
4 Europe 1977 Bosnia and Herzegovina 69.9 4086000 3528.
5 Europe 1977 Bulgaria 70.8 8797022 7612.
6 Europe 1977 Croatia 70.6 4318673 11305.
7 Europe 1977 Czech Republic 70.7 10161915 14800.
8 Europe 1977 Denmark 74.7 5088419 20423.
9 Europe 1977 Finland 72.5 4738902 15605.
10 Europe 1977 France 73.8 53165019 18293.
# ℹ 20 more rows
Here we map() a custom function to every row in the data column.
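A sketch of that step (the helper name fit_ols is illustrative; the formula matches the models tidied below):

fit_ols <- function(df) {
  lm(lifeExp ~ log(gdpPercap), data = df)
}

out_le <- out_le |>
  mutate(model = map(data, fit_ols))
out_le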
# A tibble: 60 × 4
# Groups: continent, year [60]
continent year data model
<fct> <int> <list> <list>
1 Asia 1952 <tibble [33 × 4]> <lm>
2 Asia 1957 <tibble [33 × 4]> <lm>
3 Asia 1962 <tibble [33 × 4]> <lm>
4 Asia 1967 <tibble [33 × 4]> <lm>
5 Asia 1972 <tibble [33 × 4]> <lm>
6 Asia 1977 <tibble [33 × 4]> <lm>
7 Asia 1982 <tibble [33 × 4]> <lm>
8 Asia 1987 <tibble [33 × 4]> <lm>
9 Asia 1992 <tibble [33 × 4]> <lm>
10 Asia 1997 <tibble [33 × 4]> <lm>
# ℹ 50 more rows
We can tidy the nested models, too.
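A sketch: tidy each model, unnest the results, and keep just the slope terms. Oceania, with only two countries, is dropped here, which matches the 48 rows shown:

out_tidy <- out_le |>
  mutate(tidied = map(model, tidy)) |>
  unnest(cols = c(tidied)) |>
  filter(term %nin% "(Intercept)",
         continent %nin% "Oceania")
out_tidy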
# A tibble: 48 × 9
# Groups: continent, year [48]
continent year data model term estimate std.error statistic p.value
<fct> <int> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
1 Asia 1952 <tibble> <lm> log(gdp… 4.16 1.25 3.33 2.28e-3
2 Asia 1957 <tibble> <lm> log(gdp… 4.17 1.28 3.26 2.71e-3
3 Asia 1962 <tibble> <lm> log(gdp… 4.59 1.24 3.72 7.94e-4
4 Asia 1967 <tibble> <lm> log(gdp… 4.50 1.15 3.90 4.77e-4
5 Asia 1972 <tibble> <lm> log(gdp… 4.44 1.01 4.41 1.16e-4
6 Asia 1977 <tibble> <lm> log(gdp… 4.87 1.03 4.75 4.42e-5
7 Asia 1982 <tibble> <lm> log(gdp… 4.78 0.852 5.61 3.77e-6
8 Asia 1987 <tibble> <lm> log(gdp… 5.17 0.727 7.12 5.31e-8
9 Asia 1992 <tibble> <lm> log(gdp… 5.09 0.649 7.84 7.60e-9
10 Asia 1997 <tibble> <lm> log(gdp… 5.11 0.628 8.15 3.35e-9
# ℹ 38 more rows
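Peeking at a few rows at random (a sketch):

out_tidy |>
  ungroup() |>
  sample_n(5)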
# A tibble: 5 × 9
continent year data model term estimate std.error statistic p.value
<fct> <int> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
1 Africa 1987 <tibble> <lm> log(gdpP… 6.48 0.898 7.22 2.75e-9
2 Asia 1992 <tibble> <lm> log(gdpP… 5.09 0.649 7.84 7.60e-9
3 Africa 1982 <tibble> <lm> log(gdpP… 5.61 0.898 6.25 8.79e-8
4 Africa 1972 <tibble> <lm> log(gdpP… 3.80 0.962 3.95 2.44e-4
5 Asia 1972 <tibble> <lm> log(gdpP… 4.44 1.01 4.41 1.16e-4
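Another example: principal components analysis on the midwest data that comes with ggplot2.

midwest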
# A tibble: 437 × 28
PID county state area poptotal popdensity popwhite popblack popamerindian
<int> <chr> <chr> <dbl> <int> <dbl> <int> <int> <int>
1 561 ADAMS IL 0.052 66090 1271. 63917 1702 98
2 562 ALEXAN… IL 0.014 10626 759 7054 3496 19
3 563 BOND IL 0.022 14991 681. 14477 429 35
4 564 BOONE IL 0.017 30806 1812. 29344 127 46
5 565 BROWN IL 0.018 5836 324. 5264 547 14
6 566 BUREAU IL 0.05 35688 714. 35157 50 65
7 567 CALHOUN IL 0.017 5322 313. 5298 1 8
8 568 CARROLL IL 0.027 16805 622. 16519 111 30
9 569 CASS IL 0.024 13437 560. 13384 16 8
10 570 CHAMPA… IL 0.058 173025 2983. 146506 16559 331
# ℹ 427 more rows
# ℹ 19 more variables: popasian <int>, popother <int>, percwhite <dbl>,
# percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
# popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
# poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
# percchildbelowpovert <dbl>, percadultpoverty <dbl>,
# percelderlypoverty <dbl>, inmetro <int>, category <chr>
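Keep the numeric columns, grouped by state (the same selection step appears again in the full pipeline at the end of this section):

mw_pca <- midwest |>
  group_by(state) |>
  select_if(is.numeric) |>
  select(-PID)
mw_pca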
# A tibble: 437 × 25
# Groups: state [5]
state area poptotal popdensity popwhite popblack popamerindian popasian
<chr> <dbl> <int> <dbl> <int> <int> <int> <int>
1 IL 0.052 66090 1271. 63917 1702 98 249
2 IL 0.014 10626 759 7054 3496 19 48
3 IL 0.022 14991 681. 14477 429 35 16
4 IL 0.017 30806 1812. 29344 127 46 150
5 IL 0.018 5836 324. 5264 547 14 5
6 IL 0.05 35688 714. 35157 50 65 195
7 IL 0.017 5322 313. 5298 1 8 15
8 IL 0.027 16805 622. 16519 111 30 61
9 IL 0.024 13437 560. 13384 16 8 23
10 IL 0.058 173025 2983. 146506 16559 331 8033
# ℹ 427 more rows
# ℹ 17 more variables: popother <int>, percwhite <dbl>, percblack <dbl>,
# percamerindan <dbl>, percasian <dbl>, percother <dbl>, popadults <int>,
# perchsd <dbl>, percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
# percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
# percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>
We could do this anonymously too.
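For instance, with an anonymous function inside where() in place of select_if(). Then we fit the PCA itself; out_pca is the object that augment() uses below (a sketch):

mw_pca <- midwest |>
  group_by(state) |>
  select(where(\(x) is.numeric(x))) |>
  select(-PID)

out_pca <- prcomp(mw_pca[, -1], center = TRUE, scale = TRUE)
out_pca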
Standard deviations (1, .., p=24):
[1] 3.10e+00 2.21e+00 1.65e+00 1.19e+00 1.12e+00 8.98e-01 8.86e-01 8.19e-01
[9] 6.92e-01 5.65e-01 5.44e-01 4.85e-01 3.80e-01 3.58e-01 3.09e-01 2.50e-01
[17] 2.09e-01 1.92e-01 9.65e-02 3.47e-02 1.33e-02 3.86e-03 2.89e-09 9.98e-16
Rotation (n x k) = (24 x 24):
PC1 PC2 PC3 PC4 PC5 PC6
area -0.02619 0.0150 -0.10362 0.2508 -0.626804 0.48689
poptotal -0.31299 0.0515 0.11077 0.0544 0.002497 0.04058
popdensity -0.29205 0.0236 0.04558 -0.1042 0.139053 0.15186
popwhite -0.31410 0.0238 0.08117 0.0278 0.016318 0.07259
popblack -0.28773 0.1074 0.14902 0.0598 0.006193 0.04880
popamerindian -0.26919 0.0916 -0.01213 -0.1236 -0.196022 0.14136
popasian -0.28406 0.0578 0.14522 0.2132 -0.083397 -0.16286
popother -0.25831 0.0807 0.19651 0.2163 -0.110491 -0.26225
percwhite 0.18251 -0.1756 0.25486 0.4371 0.082341 0.07589
percblack -0.19880 0.1119 -0.15319 -0.2542 0.343979 0.25296
percamerindan -0.00153 0.1745 -0.18427 -0.4065 -0.522049 -0.28835
percasian -0.19122 -0.1272 -0.33296 0.2078 0.057549 -0.06347
percother -0.17367 -0.0506 0.03017 -0.0944 -0.021881 -0.58380
popadults -0.31212 0.0529 0.11638 0.0545 0.000792 0.04190
perchsd -0.09422 -0.3559 -0.18125 -0.0433 -0.185756 0.03657
percollege -0.15307 -0.2554 -0.34484 0.0812 -0.024319 0.08217
percprof -0.14308 -0.2065 -0.39278 0.1601 0.115989 -0.03980
poppovertyknown -0.31259 0.0522 0.11422 0.0527 0.001560 0.04174
percpovertyknown 0.01603 0.0074 0.35975 -0.3495 -0.078744 0.25725
percbelowpoverty 0.02186 0.4029 -0.23735 0.0821 0.044412 -0.00326
percchildbelowpovert 0.00948 0.4121 -0.15012 -0.0412 0.051210 0.07182
percadultpoverty 0.01409 0.3590 -0.30902 0.1546 0.035637 -0.03790
percelderlypoverty 0.08536 0.3681 0.00766 0.1855 0.141928 0.08282
inmetro -0.13767 -0.1924 -0.12507 -0.3310 0.218386 0.15729
PC7 PC8 PC9 PC10 PC11 PC12
area -0.45357 0.107932 0.1266 -0.09418 -0.1123 0.0431
poptotal 0.04886 -0.028501 0.0155 0.00878 0.0179 0.0150
popdensity 0.06936 -0.066500 -0.1390 0.34249 -0.1779 -0.0595
popwhite 0.05617 -0.000858 0.0127 0.05245 -0.0656 0.0344
popblack 0.01683 -0.122024 -0.0269 -0.00446 0.1794 -0.0630
popamerindian 0.05258 -0.134705 -0.0411 0.44923 -0.1095 -0.3797
popasian 0.12879 0.047243 0.1011 -0.23737 0.0263 0.1368
popother 0.03907 0.052661 0.1772 -0.30565 0.2244 0.0863
percwhite 0.13000 0.159783 0.1321 0.28485 0.0554 -0.0947
percblack -0.38305 -0.177172 -0.2160 -0.30716 0.1082 0.1591
percamerindan 0.32453 -0.146125 0.0645 -0.12749 -0.1147 -0.0439
percasian 0.11659 0.180479 -0.0714 -0.01768 -0.5930 0.3612
percother -0.59580 0.390170 -0.0894 0.17667 -0.0671 -0.2075
popadults 0.05210 -0.029834 0.0164 0.00544 0.0206 0.0206
perchsd 0.04100 0.039628 -0.1430 0.02198 0.5422 -0.0818
percollege 0.12256 0.181171 -0.1849 -0.08584 0.1542 -0.2181
percprof 0.19379 0.101079 -0.1270 -0.13267 0.0155 -0.1796
poppovertyknown 0.04962 -0.027370 0.0146 0.00836 0.0192 0.0147
percpovertyknown 0.23760 0.682652 -0.2952 -0.13789 -0.0294 0.0681
percbelowpoverty 0.05392 0.182119 0.0472 0.11013 0.1439 0.0262
percchildbelowpovert -0.00510 0.224482 0.0394 0.19609 0.2277 0.2252
percadultpoverty 0.07611 0.179307 0.0725 0.20097 0.1930 0.0739
percelderlypoverty 0.02486 0.074240 -0.0498 -0.39718 -0.2045 -0.6657
inmetro -0.00301 0.220252 0.8179 -0.05357 -0.0222 -0.1318
PC13 PC14 PC15 PC16 PC17 PC18
area 1.49e-01 0.04925 -0.04158 0.06918 -0.08005 -0.03174
poptotal 8.09e-02 0.09421 -0.04704 -0.01902 0.22285 0.08703
popdensity 1.51e-01 0.47017 0.07373 0.13787 -0.57401 -0.28223
popwhite 1.03e-01 0.22760 0.15543 0.12548 0.46636 0.07769
popblack 8.92e-02 -0.16452 -0.64727 -0.42151 -0.25573 0.19708
popamerindian -3.34e-01 -0.55797 0.17430 0.08862 0.04284 -0.03870
popasian -1.24e-01 -0.01828 0.23649 0.12741 -0.09705 -0.03676
popother -1.07e-01 -0.20652 0.19229 0.06486 -0.31519 -0.25803
percwhite 1.61e-02 0.00537 0.00696 -0.00619 -0.00690 0.04870
percblack -9.25e-02 -0.14098 0.06451 0.05433 0.02579 -0.12478
percamerindan 1.25e-01 0.14325 -0.05045 -0.03463 -0.01622 0.04164
percasian -4.17e-01 -0.01410 -0.17323 -0.11467 -0.05198 0.10725
percother 6.38e-02 0.05381 -0.05133 -0.00728 0.02705 0.04714
popadults 8.47e-02 0.11195 -0.02766 -0.00765 0.22575 0.08734
perchsd -5.16e-01 0.32791 -0.18139 0.23073 -0.00181 0.09494
percollege 1.65e-01 0.01746 0.41803 -0.64004 -0.00284 -0.07362
percprof 4.73e-01 -0.29347 -0.16077 0.51738 -0.08001 0.13539
poppovertyknown 7.98e-02 0.09550 -0.04780 -0.02371 0.22682 0.08897
percpovertyknown -8.29e-05 -0.13054 -0.09925 0.04744 0.01969 -0.10975
percbelowpoverty -2.61e-02 0.04201 -0.05359 0.00659 0.04648 -0.06958
percchildbelowpovert 5.67e-03 0.00225 0.28393 0.03067 -0.23656 0.62012
percadultpoverty 4.30e-03 0.01308 -0.22556 -0.01478 0.21660 -0.54943
percelderlypoverty -2.38e-01 0.23711 -0.03068 0.03817 -0.02334 0.11150
inmetro -3.86e-02 0.01112 -0.05590 -0.02538 -0.04174 -0.00287
PC19 PC20 PC21 PC22 PC23 PC24
area 0.008671 0.004681 0.00061 8.37e-04 9.26e-11 4.00e-16
poptotal -0.076193 0.023054 0.26237 -2.70e-01 -1.53e-08 -8.10e-01
popdensity -0.005458 0.011704 0.00671 3.68e-03 2.87e-10 6.21e-16
popwhite -0.140728 0.030297 0.34182 -3.43e-01 2.85e-09 5.44e-01
popblack 0.104823 0.008645 0.11364 -1.40e-01 3.04e-10 2.14e-01
popamerindian 0.021665 0.004648 -0.00781 -2.08e-03 -6.54e-11 2.36e-03
popasian 0.781087 0.034020 0.01827 -1.95e-02 2.09e-10 2.58e-02
popother -0.554636 -0.010890 0.03550 -2.63e-02 1.25e-09 5.03e-02
percwhite -0.007309 0.015581 -0.00107 -1.13e-04 7.15e-01 -1.00e-08
percblack 0.034614 0.004477 -0.00140 3.42e-04 5.18e-01 -7.27e-09
percamerindan -0.019199 -0.026849 0.00482 -2.22e-04 4.58e-01 -6.42e-09
percasian -0.127336 -0.015781 -0.00556 8.87e-04 6.33e-02 -8.89e-10
percother 0.049001 -0.002061 -0.00432 -6.03e-04 8.45e-02 -1.19e-09
popadults -0.076871 -0.091715 -0.87903 -1.03e-01 -1.77e-08 8.42e-17
perchsd -0.015055 0.001811 0.00211 -1.11e-04 2.27e-10 1.13e-16
percollege 0.019522 0.007993 -0.00197 -1.66e-03 -7.27e-11 6.99e-17
percprof -0.005332 -0.006801 -0.00209 3.81e-03 -1.66e-10 -5.42e-17
poppovertyknown -0.069062 0.003553 0.13036 8.82e-01 2.86e-08 7.82e-15
percpovertyknown 0.000399 0.000865 -0.00122 -2.70e-03 -2.29e-10 2.72e-16
percbelowpoverty -0.017411 0.825060 -0.08306 4.75e-03 4.43e-09 8.59e-16
percchildbelowpovert -0.051527 -0.281404 0.03353 -7.31e-04 -8.29e-10 -5.22e-16
percadultpoverty 0.090370 -0.461037 0.04164 -2.95e-03 -2.97e-09 -4.43e-16
percelderlypoverty -0.036945 -0.121558 0.01264 -1.06e-03 -6.50e-10 -5.46e-17
inmetro 0.032697 0.000802 -0.00267 -5.15e-05 -9.24e-11 8.91e-17
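broom can tidy the PCA object, too; matrix = "pcs" asks for the component summaries:

tidy(out_pca, matrix = "pcs")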
# A tibble: 24 × 4
PC std.dev percent cumulative
<dbl> <dbl> <dbl> <dbl>
1 1 3.10 0.400 0.400
2 2 2.21 0.203 0.603
3 3 1.65 0.113 0.717
4 4 1.19 0.0593 0.776
5 5 1.12 0.0524 0.829
6 6 0.898 0.0336 0.862
7 7 0.886 0.0327 0.895
8 8 0.819 0.0280 0.923
9 9 0.692 0.0200 0.943
10 10 0.565 0.0133 0.956
# ℹ 14 more rows
augment() puts the original data points back in.
aug_pca <- augment(out_pca, data = mw_pca[,-1])
aug_pca <- aug_pca |>
tibble::add_column(midwest$state, midwest$county, .before = TRUE) |>
rename(state = `midwest$state`, county = `midwest$county`)
aug_pca
# A tibble: 437 × 51
state county .rownames area poptotal popdensity popwhite popblack
<chr> <chr> <chr> <dbl> <int> <dbl> <int> <int>
1 IL ADAMS 1 0.052 66090 1271. 63917 1702
2 IL ALEXANDER 2 0.014 10626 759 7054 3496
3 IL BOND 3 0.022 14991 681. 14477 429
4 IL BOONE 4 0.017 30806 1812. 29344 127
5 IL BROWN 5 0.018 5836 324. 5264 547
6 IL BUREAU 6 0.05 35688 714. 35157 50
7 IL CALHOUN 7 0.017 5322 313. 5298 1
8 IL CARROLL 8 0.027 16805 622. 16519 111
9 IL CASS 9 0.024 13437 560. 13384 16
10 IL CHAMPAIGN 10 0.058 173025 2983. 146506 16559
# ℹ 427 more rows
# ℹ 43 more variables: popamerindian <int>, popasian <int>, popother <int>,
# percwhite <dbl>, percblack <dbl>, percamerindan <dbl>, percasian <dbl>,
# percother <dbl>, popadults <int>, perchsd <dbl>, percollege <dbl>,
# percprof <dbl>, poppovertyknown <int>, percpovertyknown <dbl>,
# percbelowpoverty <dbl>, percchildbelowpovert <dbl>, percadultpoverty <dbl>,
# percelderlypoverty <dbl>, inmetro <int>, .fittedPC1 <dbl>, …
Let’s do that, grouped by state:
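First nest the grouped data, then map a PCA helper over each state’s table (a sketch; do_pca is the helper used in the code below):

mw_pca <- mw_pca |>
  nest()

do_pca <- function(df){
  prcomp(df, center = TRUE, scale = TRUE)
}

state_pca <- mw_pca |>
  mutate(pca = map(data, do_pca))
state_pca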
# A tibble: 5 × 3
# Groups: state [5]
state data pca
<chr> <list> <list>
1 IL <tibble [102 × 24]> <prcomp>
2 IN <tibble [92 × 24]> <prcomp>
3 MI <tibble [83 × 24]> <prcomp>
4 OH <tibble [88 × 24]> <prcomp>
5 WI <tibble [72 × 24]> <prcomp>
Alternatively, write a lambda function:
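mw_pca |>
  mutate(pca = map(data, \(x) prcomp(x, center = TRUE, scale = TRUE)))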
# A tibble: 5 × 3
# Groups: state [5]
state data pca
<chr> <list> <list>
1 IL <tibble [102 × 24]> <prcomp>
2 IN <tibble [92 × 24]> <prcomp>
3 MI <tibble [83 × 24]> <prcomp>
4 OH <tibble [88 × 24]> <prcomp>
5 WI <tibble [72 × 24]> <prcomp>
do_tidy <- function(pr){
broom::tidy(pr, matrix = "pcs")
}
state_pca <- mw_pca |>
mutate(pca = map(data, do_pca),
pcs = map(pca, do_tidy))
state_pca
# A tibble: 5 × 4
# Groups: state [5]
state data pca pcs
<chr> <list> <list> <list>
1 IL <tibble [102 × 24]> <prcomp> <tibble [24 × 4]>
2 IN <tibble [92 × 24]> <prcomp> <tibble [24 × 4]>
3 MI <tibble [83 × 24]> <prcomp> <tibble [24 × 4]>
4 OH <tibble [88 × 24]> <prcomp> <tibble [24 × 4]>
5 WI <tibble [72 × 24]> <prcomp> <tibble [24 × 4]>
Or again, write a lambda function:
mw_pca |>
mutate(pca = map(data, \(x) prcomp(x, center = TRUE, scale = TRUE)),
pcs = map(pca, \(x) tidy(x, matrix = "pcs")))
# A tibble: 5 × 4
# Groups: state [5]
state data pca pcs
<chr> <list> <list> <list>
1 IL <tibble [102 × 24]> <prcomp> <tibble [24 × 4]>
2 IN <tibble [92 × 24]> <prcomp> <tibble [24 × 4]>
3 MI <tibble [83 × 24]> <prcomp> <tibble [24 × 4]>
4 OH <tibble [88 × 24]> <prcomp> <tibble [24 × 4]>
5 WI <tibble [72 × 24]> <prcomp> <tibble [24 × 4]>
mw_pca |>
mutate(pca = map(data, \(x) prcomp(x, center = TRUE, scale = TRUE)),
pcs = map(pca, \(x) tidy(x, matrix = "pcs"))) |>
unnest(cols = c(pcs)) |>
ggplot(aes(x = PC, y = percent)) +
geom_line(linewidth = 1.1) +
facet_wrap(~ state, nrow = 1) +
labs(x = "Principal Component",
y = "Variance Explained")
The same approach works with augment():
do_aug <- function(pr){
broom::augment(pr)
}
state_pca <- mw_pca |>
mutate(pca = map(data, do_pca),
pcs = map(pca, do_tidy),
fitted = map(pca, do_aug))
state_pca
# A tibble: 5 × 5
# Groups: state [5]
state data pca pcs fitted
<chr> <list> <list> <list> <list>
1 IL <tibble [102 × 24]> <prcomp> <tibble [24 × 4]> <tibble [102 × 25]>
2 IN <tibble [92 × 24]> <prcomp> <tibble [24 × 4]> <tibble [92 × 25]>
3 MI <tibble [83 × 24]> <prcomp> <tibble [24 × 4]> <tibble [83 × 25]>
4 OH <tibble [88 × 24]> <prcomp> <tibble [24 × 4]> <tibble [88 × 25]>
5 WI <tibble [72 × 24]> <prcomp> <tibble [24 × 4]> <tibble [72 × 25]>
mw_pca |>
mutate(pca = map(data, \(x) prcomp(x, center = TRUE, scale = TRUE)),
pcs = map(pca, \(x) tidy(x, matrix = "pcs")),
fitted = map(pca, \(x) augment(x)))
# A tibble: 5 × 5
# Groups: state [5]
state data pca pcs fitted
<chr> <list> <list> <list> <list>
1 IL <tibble [102 × 24]> <prcomp> <tibble [24 × 4]> <tibble [102 × 25]>
2 IN <tibble [92 × 24]> <prcomp> <tibble [24 × 4]> <tibble [92 × 25]>
3 MI <tibble [83 × 24]> <prcomp> <tibble [24 × 4]> <tibble [83 × 25]>
4 OH <tibble [88 × 24]> <prcomp> <tibble [24 × 4]> <tibble [88 × 25]>
5 WI <tibble [72 × 24]> <prcomp> <tibble [24 × 4]> <tibble [72 × 25]>
out <- midwest |>
group_by(state) |>
select_if(is.numeric) |>
select(-PID) |>
nest() |>
mutate(pca = map(data, \(x) prcomp(x, center = TRUE, scale = TRUE)),
pcs = map(pca, \(x) tidy(x, matrix = "pcs")),
fitted = map(pca, \(x) augment(x))) |>
unnest(cols = c(fitted)) |>
add_column(county = midwest$county)
out
# A tibble: 437 × 30
# Groups: state [5]
state data pca pcs .rownames .fittedPC1 .fittedPC2 .fittedPC3
<chr> <list> <list> <list> <chr> <dbl> <dbl> <dbl>
1 IL <tibble> <prcomp> <tibble> 1 0.403 -0.233 0.00488
2 IL <tibble> <prcomp> <tibble> 2 0.413 8.14 3.33
3 IL <tibble> <prcomp> <tibble> 3 0.908 0.326 -0.303
4 IL <tibble> <prcomp> <tibble> 4 -0.344 -2.37 -0.907
5 IL <tibble> <prcomp> <tibble> 5 0.940 1.61 1.06
6 IL <tibble> <prcomp> <tibble> 6 0.482 -1.07 -0.744
7 IL <tibble> <prcomp> <tibble> 7 1.61 1.83 -1.29
8 IL <tibble> <prcomp> <tibble> 8 0.835 -0.428 -0.839
9 IL <tibble> <prcomp> <tibble> 9 1.28 0.408 -1.08
10 IL <tibble> <prcomp> <tibble> 10 -3.49 -3.13 7.39
# ℹ 427 more rows
# ℹ 22 more variables: .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
# .fittedPC7 <dbl>, .fittedPC8 <dbl>, .fittedPC9 <dbl>, .fittedPC10 <dbl>,
# .fittedPC11 <dbl>, .fittedPC12 <dbl>, .fittedPC13 <dbl>, .fittedPC14 <dbl>,
# .fittedPC15 <dbl>, .fittedPC16 <dbl>, .fittedPC17 <dbl>, .fittedPC18 <dbl>,
# .fittedPC19 <dbl>, .fittedPC20 <dbl>, .fittedPC21 <dbl>, .fittedPC22 <dbl>,
# .fittedPC23 <dbl>, .fittedPC24 <dbl>, county <chr>
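The last example uses PCA to compress an image. A sketch of the setup, assuming the imager package and a grayscale image file (the path here is illustrative; the object names match those used below). The outputs that follow come from these steps in turn:

library(imager)

img <- load.image(here("files", "img", "example.jpg")) # illustrative path
str(img)
dim(img)

img_df_long <- as.data.frame(img) # one row per pixel: x, y, value
head(img_df_long)

img_df <- img_df_long |>
  pivot_wider(names_from = y, values_from = value) # one row per x; one column per y
dim(img_df)
img_df[1:5, 1:5]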
'cimg' num [1:800, 1:633, 1, 1] 0.914 0.929 0.91 0.906 0.898 ...
[1] 800 633 1 1
x y value
1 1 1 0.914
2 2 1 0.929
3 3 1 0.910
4 4 1 0.906
5 5 1 0.898
6 6 1 0.886
[1] 800 634
# A tibble: 5 × 5
x `1` `2` `3` `4`
<int> <dbl> <dbl> <dbl> <dbl>
1 1 0.914 0.914 0.914 0.910
2 2 0.929 0.929 0.925 0.918
3 3 0.910 0.910 0.902 0.894
4 4 0.906 0.902 0.898 0.894
5 5 0.898 0.894 0.890 0.886
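Dropping the x column leaves just the 633 columns of pixel values:

tmp <- img_df |> select(-x)
dim(tmp)
tmp[1:5, 1:5]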
[1] 800 633
# A tibble: 5 × 5
`1` `2` `3` `4` `5`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.914 0.914 0.914 0.910 0.902
2 0.929 0.929 0.925 0.918 0.910
3 0.910 0.910 0.902 0.894 0.886
4 0.906 0.902 0.898 0.894 0.890
5 0.898 0.894 0.890 0.886 0.882
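Then standardize the columns and take their covariance (a sketch; tmp_norm and cov_mat are the names used in the manual PCA below):

tmp_norm <- scale(as.matrix(tmp), center = TRUE, scale = TRUE)
tmp_norm[1:5, 1:5]

cov_mat <- cov(tmp_norm) # with standardized columns, this is the correlation matrix
dim(cov_mat)
cov_mat[1:5, 1:5]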
1 2 3 4 5
[1,] 0.306 0.327 0.335 0.314 0.266
[2,] 0.417 0.433 0.412 0.365 0.317
[3,] 0.278 0.301 0.258 0.212 0.166
[4,] 0.251 0.248 0.232 0.212 0.191
[5,] 0.195 0.195 0.181 0.161 0.141
[1] 633 633
1 2 3 4 5
1 1.000 0.995 0.988 0.985 0.982
2 0.995 1.000 0.997 0.990 0.983
3 0.988 0.997 1.000 0.996 0.987
4 0.985 0.990 0.996 1.000 0.993
5 0.982 0.983 0.987 0.993 1.000
Doing the PCA manually
# Decomposition/Factorization into eigenvalues and eigenvectors
cov_eig <- eigen(cov_mat)
names(cov_eig)
[1] "values" "vectors"
[1] 231.4 120.6 56.9 31.1 22.8
[,1] [,2] [,3] [,4] [,5]
[1,] -0.00616 -0.0657 0.0288 -0.0393 0.0601
[2,] -0.00673 -0.0661 0.0286 -0.0385 0.0590
[3,] -0.00715 -0.0659 0.0274 -0.0389 0.0585
[4,] -0.00747 -0.0660 0.0267 -0.0383 0.0594
[5,] -0.00648 -0.0661 0.0279 -0.0399 0.0606
# Rotated data -- i.e. the coordinates of the original data points
# translated into the transformed coordinate space (what prcomp()
# returns as its x component; the eigenvectors are its rotation)
cov_evecs <- cov_eig$vectors # the component loadings
tmp_rot <- tmp_norm %*% cov_evecs
dim(tmp_rot)
[1] 800 633
[,1] [,2] [,3] [,4] [,5]
[1,] 11.59 -4.48 -2.27 3.67 4.66
[2,] 10.92 -4.39 -3.48 4.24 6.42
[3,] 9.41 -4.24 -4.51 3.98 6.59
[4,] 8.77 -3.95 -4.65 3.40 4.50
[5,] 8.45 -3.15 -4.49 2.57 1.61
[,1] [,2] [,3] [,4] [,5]
[1,] 0 0 0 0 0
[2,] 0 0 0 0 0
[3,] 0 0 0 0 0
[4,] 0 0 0 0 0
[5,] 0 0 0 0 0
img_pca <- img_df |>
select(-x) |>
prcomp(scale = TRUE, center = TRUE)
pca_tidy <- tidy(img_pca, matrix = "pcs")
pca_tidy
# A tibble: 633 × 4
PC std.dev percent cumulative
<dbl> <dbl> <dbl> <dbl>
1 1 15.2 0.366 0.366
2 2 11.0 0.191 0.556
3 3 7.54 0.0899 0.646
4 4 5.57 0.0491 0.695
5 5 4.78 0.0361 0.731
6 6 4.56 0.0328 0.764
7 7 4.06 0.0261 0.790
8 8 3.66 0.0212 0.811
9 9 3.37 0.0179 0.829
10 10 3.28 0.0170 0.846
# ℹ 623 more rows
reverse_pca <- function(n_comp = 20, pca_object = img_pca){
## The pca_object is an object created by base R's prcomp() function.
## Multiply the matrix of rotated data by the transpose of the matrix
## of eigenvalues (i.e. the component loadings) to get back to a
## matrix of original data values
recon <- pca_object$x[, 1:n_comp] %*% t(pca_object$rotation[, 1:n_comp])
## Reverse any scaling and centering that was done by prcomp()
if(all(pca_object$scale != FALSE)){
## Rescale by the reciprocal of the scaling factor, i.e. back to
## original range.
recon <- scale(recon, center = FALSE, scale = 1/pca_object$scale)
}
if(all(pca_object$center != FALSE)){
## Remove any mean centering by adding the subtracted mean back in
recon <- scale(recon, scale = FALSE, center = -1 * pca_object$center)
}
## Make it a data frame that we can easily pivot to long format
## (because that's the format that the excellent imager library wants
## when drawing image plots with ggplot)
recon_df <- data.frame(cbind(1:nrow(recon), recon))
colnames(recon_df) <- c("x", 1:(ncol(recon_df)-1))
## Return the data to long form
recon_df_long <- recon_df |>
tidyr::pivot_longer(cols = -x,
names_to = "y",
values_to = "value") |>
mutate(y = as.numeric(y)) |>
arrange(y) |>
as.data.frame()
tibble::as_tibble(recon_df_long)
}
## The sequence of PCA components we want
n_pcs <- c(2:5, 10, 20, 50, 100)
names(n_pcs) <- paste("First", n_pcs, "Components", sep = "_")
## Map reverse_pca()
recovered_imgs <- map(n_pcs, reverse_pca) |>
list_rbind(names_to = "pcs") |>
mutate(pcs = str_replace_all(pcs, "_", " "),
pcs = factor(pcs, levels = unique(pcs), ordered = TRUE))
recovered_imgs
# A tibble: 4,051,200 × 4
pcs x y value
<ord> <dbl> <dbl> <dbl>
1 First 2 Components 1 1 0.902
2 First 2 Components 2 1 0.902
3 First 2 Components 3 1 0.902
4 First 2 Components 4 1 0.899
5 First 2 Components 5 1 0.892
6 First 2 Components 6 1 0.886
7 First 2 Components 7 1 0.877
8 First 2 Components 8 1 0.858
9 First 2 Components 9 1 0.823
10 First 2 Components 10 1 0.772
# ℹ 4,051,190 more rows
p <- ggplot(data = recovered_imgs,
mapping = aes(x = x, y = y, fill = value))
p_out <- p + geom_raster() +
scale_y_reverse() +
scale_fill_gradient(low = "black", high = "white") +
facet_wrap(~ pcs, ncol = 4) +
guides(fill = "none") +
labs(title = "Recovering the content of an 800x600 pixel image\nfrom a Principal Components Analysis of its pixels") +
theme(strip.text = element_text(face = "bold", size = rel(1.2)),
plot.title = element_text(size = rel(1.5)))