U.S. Post Office Data Visualization

Thu, Mar 24, 2022 4-minute read

In this blog post, I will analyze the post offices in the U.S. in terms of when they were established and when they were discontinued.

library(tidyverse)
library(geofacet)
library(tidytext)
theme_set(theme_bw())
# Load the TidyTuesday post-office data and keep only the rows used by the
# analysis below.
post_offices <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-13/post_offices.csv"
) %>%
  # Each comparison also drops rows where a compared value is NA, so offices
  # without coordinates, without an established year, or without a
  # discontinued year are removed here.
  filter(longitude < 100) %>%
  filter(established > 1000) %>%
  filter(established < discontinued, discontinued < 2020) %>%
  select(-c(contains("gnis"), alt_name, orig_name)) %>%
  mutate(
    name = str_to_title(name),
    # Floor each year to the first year of its decade (e.g. 1895 -> 1890).
    est_decade = (established %/% 10) * 10,
    dis_decade = (discontinued %/% 10) * 10
  )

# Preview the cleaned data.
post_offices
## # A tibble: 82,173 x 17
##    name      state county1  county2 county3 orig_county established discontinued
##    <chr>     <chr> <chr>    <lgl>   <lgl>   <chr>             <dbl>        <dbl>
##  1 Aaron     MO    BATES    NA      NA      Bates              1895         1933
##  2 Aaron     GA    BULLOCH  NA      NA      Bulloch            1909         1920
##  3 Aaron     SC    ANDERSON NA      NA      Anderson           1892         1899
##  4 Abadyl    MO    CHRISTI~ NA      NA      Christian          1895         1919
##  5 Abattis   MO    WARREN   NA      NA      Warren             1878         1904
##  6 Abaugh    AR    NEWTON   NA      NA      Newton             1928         1954
##  7 Abba      GA    IRWIN    NA      NA      Irwin              1884         1954
##  8 Abbey     AL    FRANKLIN NA      NA      Franklin           1898         1902
##  9 Abbie Joe LA    BEAUREG~ NA      NA      Beauregard         1918         1925
## 10 Abbot     ME    PISCATA~ NA      NA      Piscataqui~        1825         1912
## # ... with 82,163 more rows, and 9 more variables: continuous <lgl>,
## #   stamp_index <dbl>, id <dbl>, coordinates <lgl>, duration <dbl>,
## #   latitude <dbl>, longitude <dbl>, est_decade <dbl>, dis_decade <dbl>

The post office map:

# Scatter every post office at its longitude/latitude, coloured by state.
# The legend is hidden and theme_void() removes axes and gridlines.
post_offices %>%
  ggplot(aes(longitude, latitude, color = state)) +
  # Spell out FALSE: the shorthand F is an ordinary variable that can be
  # reassigned.
  geom_point(show.legend = FALSE) +
  theme_void() +
  labs(title = "The Locations of Post Offices")

Post office maps by period of establishment:

# Map the post offices in one panel per establishment period
# (before 1800, 1800-1849, 1850-1899, 1900 onwards).
post_offices %>%
  mutate(
    timeline = case_when(
      established < 1800 ~ "Before 1800",
      established >= 1800 & established <= 1849 ~ "Between 1800 and 1849",
      established >= 1850 & established <= 1899 ~ "Between 1850 and 1899",
      established >= 1900 ~ "After 1900"
    ),
    # Order the panels chronologically by the years they actually contain.
    # Parsing the first number out of the label is fragile: "Before 1800" and
    # "Between 1800 and 1849" both parse to 1800.
    timeline = fct_reorder(timeline, established)
  ) %>%
  ggplot(aes(longitude, latitude, color = state)) +
  geom_point(show.legend = FALSE) +
  theme_void() +
  labs(title = "The Locations of Post Offices per Century") +
  theme(
    strip.text = element_text(size = 15),
    plot.title = element_text(size = 18)
  ) +
  coord_fixed() +
  facet_wrap(~timeline, ncol = 2)

The maps above show how the United States expanded geographically over time, as reflected in where new post offices were established.

The five post offices with the longest duration in each state:

# For each state, pick the five post offices with the largest duration and
# draw each one as a horizontal bar from its established year to its
# discontinued year, with one panel per state laid out by facet_geo().
post_offices %>%
  group_by(state) %>%
  # with_ties = FALSE keeps exactly five rows per state even when durations tie.
  slice_max(duration, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  # Order names by established year within each state's panel.
  mutate(name = reorder_within(name, established, state)) %>%
  ggplot(aes(y = name, color = state)) +
  geom_errorbarh(
    aes(xmin = established, xmax = discontinued),
    height = 0.2
  ) +
  # Use the full argument name `scales` rather than relying on partial
  # matching of `scale`.
  facet_geo(~state, scales = "free_y") +
  scale_y_reordered() +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = "none"
  ) +
  labs(
    y = "post office name",
    title = "Top 5 Longest Used Post Offices per State"
  )

Does duration have anything to do with the established decade?

# Boxplot of duration for each established decade, one box per decade,
# with fill and outline coloured by decade and the legend hidden.
post_offices %>%
  ggplot(aes(
    est_decade,
    duration,
    group = est_decade,
    fill = factor(est_decade),
    color = factor(est_decade)
  )) +
  geom_boxplot(alpha = 0.6, show.legend = FALSE) +
  scale_x_continuous(breaks = seq(1750, 2000, 20)) +
  labs(
    x = "established decade",
    y = "duration (in years)",
    title = "How long did post office last per established decade?"
  )

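Post offices established in more recent decades tend to have slightly shorter durations. Note that the data used here only keeps post offices that were discontinued before 2020, so recently established offices can only appear with short durations.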

# For each state, take the earliest-established post office and show its
# established year as a labelled bar, filled by duration.
post_offices %>%
  group_by(state) %>%
  # with_ties = FALSE keeps a single row per state when several offices share
  # the earliest year.
  slice_min(established, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    name = paste0(name, "(", state, ")"),
    name = fct_reorder(name, established)
  ) %>%
  ggplot(aes(established, name, fill = duration)) +
  geom_col() +
  geom_text(aes(label = established), hjust = 0) +
  # The year is printed next to each bar, so the x axis text is redundant.
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  # scale_fill_gradient2() is centred on midpoint = 0 by default, so for
  # positive durations (presumably all of them, given established <
  # discontinued) the "red" end never appeared. A three-colour gradient over
  # the observed range runs red (shortest) -> pink -> green (longest).
  scale_fill_gradientn(colours = c("red", "pink", "green")) +
  # Leave room to the right of the bars for the year labels.
  expand_limits(x = 1940) +
  labs(
    x = "established year",
    y = "post office name and state",
    title = "The First Post Office and its Established Year per State"
  )

The oldest post office was established in Maine in 1764.

Number of post offices established and discontinued per decade:

# Count post offices established and discontinued in each decade and plot
# both series against the decade.
post_offices %>%
  group_by(est_decade) %>%
  summarize(est_num = n()) %>%
  ungroup() %>%
  full_join(
    post_offices %>%
      group_by(dis_decade) %>%
      summarize(dis_num = n()) %>%
      ungroup(),
    by = c("est_decade" = "dis_decade")
  ) %>%
  # A decade that appears in only one of the two tables leaves the full join
  # with NA in the other count; that count is really zero.
  replace_na(list(est_num = 0L, dis_num = 0L)) %>%
  # After the join the key holds both kinds of decade, so give it a neutral
  # name.
  rename(
    decade = est_decade,
    `Established Number` = est_num,
    `Discontinued Number` = dis_num
  ) %>%
  # Select the count columns by name rather than by position.
  pivot_longer(c(`Established Number`, `Discontinued Number`)) %>%
  ggplot(aes(decade, value, color = name)) +
  geom_line() +
  geom_point() +
  labs(
    x = "decade",
    y = NULL,
    color = NULL,
    title = "# of Post Offices Established and Discontinued per Decade"
  )

# Count post offices established and discontinued in each year and plot both
# series against the year.
post_offices %>%
  group_by(established) %>%
  summarize(est_num = n()) %>%
  ungroup() %>%
  full_join(
    post_offices %>%
      group_by(discontinued) %>%
      summarize(dis_num = n()) %>%
      ungroup(),
    by = c("established" = "discontinued")
  ) %>%
  # A year that appears in only one of the two tables leaves the full join
  # with NA in the other count; that count is really zero.
  replace_na(list(est_num = 0L, dis_num = 0L)) %>%
  # After the join the key holds both kinds of year, so give it a neutral
  # name.
  rename(
    year = established,
    `Established Number` = est_num,
    `Discontinued Number` = dis_num
  ) %>%
  # Select the count columns by name rather than by position.
  pivot_longer(c(`Established Number`, `Discontinued Number`)) %>%
  ggplot(aes(year, value, color = name)) +
  geom_line() +
  geom_point() +
  labs(
    x = "year",
    y = NULL,
    color = NULL,
    title = "# of Post Offices Established and Discontinued per year"
  )