U.S. Incarceration Data Visualization Analysis with Functional Programming

Mon, Sep 27, 2021 4-minute read

The datasets of this blog posts are from TidyTuesday about prison and pretrial in the U.S. in the last several decades.

library(tidyverse)
library(patchwork)
library(scales)
library(tidytext)
library(geofacet)
theme_set(theme_light())

Prison Summary

First off, let’s analyze the overall prison summary.

prison_summary <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-22/prison_summary.csv")

prison_summary

## # A tibble: 1,000 x 4
##     year urbanicity pop_category rate_per_100000
##    <dbl> <chr>      <chr>                  <dbl>
##  1  1983 rural      Black                 1117. 
##  2  1983 rural      Female                  22.8
##  3  1983 rural      Male                   574. 
##  4  1983 rural      Other                  315. 
##  5  1983 rural      Total                  248. 
##  6  1983 rural      White                  155. 
##  7  1983 small/mid  Black                 1138. 
##  8  1983 small/mid  Female                  26.6
##  9  1983 small/mid  Male                   550. 
## 10  1983 small/mid  Other                   87.5
## # ... with 990 more rows

Total prison rate among different regions

p1 <- prison_summary %>% 
  filter(pop_category == "Total") %>%
  ggplot(aes(year, rate_per_100000, color = urbanicity)) +
  geom_line() +
  geom_point() +
  facet_wrap(~urbanicity) +
  theme(
    legend.position = "none",
    strip.text = element_text(size = 15, face = "bold")
  ) +
  scale_x_continuous(breaks = seq(1970, 2020, by = 5)) +
  labs(y = "rate per 100,000",
       title = "Yearly Total Prison Rate Among Different Regions")

Prison rate among men and women

p2 <- prison_summary %>%
  filter(pop_category %in% c("Male", "Female")) %>%
  mutate(pop_category = fct_reorder(pop_category, -rate_per_100000, sum)) %>%
  ggplot(aes(year, rate_per_100000, color = pop_category)) +
  geom_line() +
  geom_point() +
  facet_wrap(~urbanicity) +
  theme(
    strip.text = element_text(size = 15, face = "bold")
  ) +
  scale_x_continuous(breaks = seq(1970, 2020, by = 10)) +
  labs(y = "rate per 100,000",
       title = "Yearly Gender Prison Rate Among Different Regions",
       color = NULL)

Prison rate from different races

p3 <- prison_summary %>%
  filter(pop_category %in% c("Asian", "Black", "Latino", "Native American", "White")) %>%
  ggplot(aes(year, rate_per_100000, color = urbanicity)) +
  geom_line() +
  geom_point() +
  facet_wrap(~pop_category) +
  theme(
    strip.text = element_text(size = 15, face = "bold")
  ) +
  scale_x_continuous(breaks = seq(1970, 2020, by = 10)) +
  labs(y = "rate per 100,000",
       title = "Yearly Prison Rate from Various Races Among Different Regions",
       color = NULL)

Now using patchwork to bind the three plot objects together

p1 / p2 / p3

Prison Population

prison_pop <- read_csv("https://github.com/rfordatascience/tidytuesday/blob/master/data/2019/2019-01-22/prison_population.csv?raw=true")

skimr::skim(prison_pop)

Table 1: Data summary
Name	prison_pop
Number of rows	1327797
Number of columns	9
_______________________
Column type frequency:
character	6
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
state	1	2	2	51
county_name	1	10	33	1876
urbanicity	1	5	9	4
region	1	4	9	4
division	1	7	18	9
pop_category	1	4	15	9

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	1993.00	13.56	1970	1981	1993	2005	2016	▇▇▇▇▇
population	273093	0.79	23079.75	107980.43	0	204	3527	13279	6974673	▇▁▁▁▁
prison_population	751787	0.43	141.56	936.79	0	0	6	59	58091	▇▁▁▁▁

Prison population percentage in each decade in every state

Functional programming is used to genreate line plots in each state in every population cateogry presented in the dataset.

prison_pop_map <- function(metric){
  prison_pop %>%
  filter(pop_category == metric) %>%
  group_by(state, year) %>%
  mutate(state_pop_year = sum(population),
         prison_pop_year = sum(prison_population, na.rm = TRUE)) %>%
  ungroup() %>% 
  filter(prison_pop_year > 0) %>%
  mutate(prison_percentage = prison_pop_year/state_pop_year) %>%
  ggplot(aes(year, prison_percentage, color = state)) +
  geom_line(size = 1, show.legend = F) +
  facet_geo(~state) +
  scale_y_continuous(labels = percent) +
  labs(y = "yearly prison population percentage",
       title = paste("State-wise Prison Population Percentage among", metric),
       subtitle = "Missing values are prevalent in the dataset. A state without any line does not indicate it does not have any prisoner but rather due to missing values")
  
}

map(unique(prison_pop$pop_category), prison_pop_map)

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

Pretrial Summary

pre_summary <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-22/pretrial_summary.csv")


pre_summary %>%
  mutate(urbanicity = fct_reorder(urbanicity, -rate_per_100000, sum)) %>%
  ggplot(aes(year, rate_per_100000, color = urbanicity)) +
  geom_line(size = 1) +
  geom_point() +
  facet_wrap(~pop_category, ncol = 1) +
  labs(y = "rate per 100,000",
       color = NULL,
       title = "Pretrial Rates among Different Regions") +
  scale_x_continuous(breaks = seq(1970, 2020, 5)) +
  theme(
    strip.text = element_text(size = 15, face = "bold")
  )

Pretrial Population

pre_pop <- read_csv("https://github.com/rfordatascience/tidytuesday/blob/master/data/2019/2019-01-22/pretrial_population.csv?raw=true")


pre_pop_func <- function(category_metric){
  pre_pop %>%
  filter(pop_category == category_metric) %>%
  group_by(year, region) %>%
  summarize(pop = sum(population, na.rm = T),
            pre_pop = sum(pretrial_population, na.rm = T)) %>%
  ungroup() %>%
  mutate(pre_percentage = pre_pop/pop) %>%
  ggplot(aes(year, pre_percentage, color = region)) +
  geom_line(size = 1) +
  geom_point() +
  scale_y_continuous(labels = percent) +
  labs(y = "pretrial percentage",
       title = paste("Pretrial Percentage among", category_metric),
       subtitle = "Pretrial percentage is computed by using # of pretrial cases divides by total population in the region")
  
}

map(unique(pre_pop$pop_category)[1:3], pre_pop_func)

## [[1]]

## 
## [[2]]

## 
## [[3]]