Data Visualization on the Great American Beer Festival (Logistic Regression Used)

Sat, Feb 26, 2022 5-minute read

The data for this blog post is from TidyTuesday.

library(tidyverse)
library(geofacet)
library(tidytext)
library(tidylo)
library(scales)
library(broom)
theme_set(theme_light())

beer_awards <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-20/beer_awards.csv') %>%
  mutate(state = str_to_upper(state),
         medal = factor(medal, levels = c("Bronze", "Silver", "Gold")))

beer_awards

## # A tibble: 4,970 x 7
##    medal  beer_name                           brewery city  state category  year
##    <fct>  <chr>                               <chr>   <chr> <chr> <chr>    <dbl>
##  1 Gold   Volksbier Vienna                    Wibby ~ Long~ CO    America~  2020
##  2 Silver Oktoberfest                         Founde~ Gran~ MI    America~  2020
##  3 Bronze Amber Lager                         Skippi~ Stau~ VA    America~  2020
##  4 Gold   Lager at World's End                Epidem~ Conc~ CA    America~  2020
##  5 Silver Seismic Tremor                      Seismi~ Sant~ CA    America~  2020
##  6 Bronze Lite Thinking                       Pollya~ Lemo~ IL    America~  2020
##  7 Gold   Beachscape                          Ventur~ Vent~ CA    America~  2020
##  8 Silver Imagine a World with Beer Cellars ~ Freeta~ San ~ TX    America~  2020
##  9 Bronze Pilsner                             Old To~ Port~ OR    America~  2020
## 10 Gold   Tank 7                              Boulev~ Kans~ MO    America~  2020
## # ... with 4,960 more rows

Medal counts:

beer_awards %>%
  count(year, state, medal, sort = T) %>%
  ggplot(aes(year, n, color = medal)) +
  geom_line(size = 1) +
  facet_geo(~state) +
  theme(strip.text = element_text(size = 15),
        plot.title = element_text(size = 18)) +
  labs(y = "# of medals",
       title = "State-wise Medal Counts across Years")

Gold beers:

beer_awards %>%
  filter(medal == "Gold") %>%
  group_by(beer_name, state) %>%
  summarize(year_min = min(year),
            year_max = max(year),
            year_diff = year_max - year_min,
            n = n()) %>%
  arrange(desc(year_diff)) %>%
  filter(n > 2) %>%
  ungroup() %>%
  ggplot(aes(year_min, year_max, color = state)) +
  geom_point(aes(size = n)) +
  geom_text(aes(label = beer_name), check_overlap = T, hjust = 1, vjust = 1) +
  labs(x = "first year getting gold medal",
       y = "last year getting gold medal",
       size = "# of gold medals",
       title = "Gold-medal Beers with Respect to Year and State")

Top 8 beer categories:

beer_awards %>%
  count(category, medal, sort = T) %>%
  group_by(medal) %>%
  slice_max(n, n = 8, with_ties = F) %>%
  ungroup() %>%
  mutate(category = fct_reorder(category, n, sum)) %>%
  ggplot(aes(n, category, fill = medal)) +
  geom_col() +
  labs(x = "# of medals",
       y = "beer category",
       fill = NULL,
       title = "Top 8 Categories per Medal")

Which city is famous for beer and brewery?

beer_awards %>%
  count(state, city, medal, sort = T) %>%
  head(100) %>%
  mutate(city = paste0(city, " (", state, ")")) %>%
  mutate(city = reorder_within(city, n, medal)) %>%
  ggplot(aes(n, city, fill = medal)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~medal, scales = "free_y") +
  theme(legend.position = "none",
        strip.text = element_text(size = 15),
        plot.title = element_text(size = 18)) +
  labs(x = "# of awards",
       y = "city with state",
       title = "Which City is the Most Beer-awarded City?")

Top 10 Breweries:

beer_awards %>%
  filter(fct_lump(brewery, n = 10) != "Other") %>%
  count(brewery, medal, sort = T) %>%
  mutate(brewery = fct_reorder(brewery, n, sum)) %>%
  ggplot(aes(medal, brewery, fill = n)) +
  geom_tile() +
  scale_fill_gradient2(high = "green",
                       low = "red",
                       midpoint = 15) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0)) +
  labs(x = NULL,
       y = NULL,
       fill = "# of medals",
       title = "10 Most-awarded Breweries and Their Medal Counts") +
  theme(plot.title = element_text(size = 18),
        axis.text = element_text(size = 13))

beer_joined <- beer_awards %>%
  count(beer_name, medal, brewery, sort = T) %>%
  group_by(beer_name, brewery) %>%
  summarize(n = n()) %>%
  filter(n > 1) %>%
  ungroup() %>%
  select(-n) %>%
  inner_join(beer_awards,
             by = c("beer_name", "brewery")) 

beer_joined

## # A tibble: 1,003 x 7
##    beer_name               brewery              medal city  state category  year
##    <chr>                   <chr>                <fct> <chr> <chr> <chr>    <dbl>
##  1 2004 Triple Exultation  Eel River Brewing C~ Silv~ Fort~ CA    Aged Be~  2012
##  2 2004 Triple Exultation  Eel River Brewing C~ Bron~ Fort~ CA    Aged Be~  2011
##  3 5 Barrel Pale Ale       Odell Brewing Co.    Gold  Fort~ CO    Classic~  2013
##  4 5 Barrel Pale Ale       Odell Brewing Co.    Silv~ Fort~ CO    Classic~  2005
##  5 Abbey Belgian Style Ale New Belgium Brewing~ Bron~ Fort~ CO    Belgian~  2005
##  6 Abbey Belgian Style Ale New Belgium Brewing~ Gold  Fort~ CO    Belgian~  2004
##  7 Abbey Belgian Style Ale New Belgium Brewing~ Gold  Fort~ CO    Belgian~  2003
##  8 Abbey Belgian Style Ale New Belgium Brewing~ Bron~ Fort~ CO    Belgian~  2001
##  9 Abbey Belgian Style Ale New Belgium Brewing~ Gold  Fort~ CO    Belgian~  2000
## 10 Abbey Belgian Style Ale New Belgium Brewing~ Bron~ Fort~ CO    Belgian~  1998
## # ... with 993 more rows

Which beers fluctuated the medals?

beer_joined %>%
  filter(fct_lump(beer_name, n = 9) != "Other") %>%
  ggplot(aes(year, medal, color = beer_name, group = beer_name)) +
  geom_line(size = 1) +
  geom_point() +
  facet_wrap(~beer_name) +
  theme(legend.position = "none",
        strip.text = element_text(size = 11),
        plot.title = element_text(size = 18)) +
  labs(y = NULL,
       title = "Beers that Fluctuated among the Medals?")

The following code snippets are inspired by David Robinson. Here you can find his code.

Which beer category is over represented per state?

beer_awards %>%
  mutate(state = state.name[match(state, state.abb)]) %>%
  filter(fct_lump(state, n = 9) != "Other",
         fct_lump(category, n = 12) != "Other") %>%
  count(category, state, sort = T) %>%
  complete(state, category, fill = list(n = 0)) %>%
  bind_log_odds(state, category, n) %>%
  mutate(category = reorder_within(category, log_odds_weighted, state)) %>%
  ggplot(aes(log_odds_weighted, category, fill = log_odds_weighted > 0)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~state, scales = "free_y") +
  theme(legend.position = "none",
        strip.text = element_text(size = 15),
        plot.title = element_text(size = 18)) +
  labs(x = "weighted log odds",
       y = "beer category",
       title = "Which Beer Category is Over or Under Represented?")

by_year <- beer_awards %>%
  add_count(year, name = "year_total") %>%
  mutate(state = fct_lump(state, n = 9)) %>%
  filter(state != "Other") %>%
  count(state, year, year_total, sort = T) %>%
  mutate(pct_year = n/year_total) 

by_year %>%
  ggplot(aes(year, pct_year, color = state)) +
  geom_line(size = 1) +
  expand_limits(y = 0) +
  scale_y_continuous(labels = percent) +
  facet_wrap(~ state)

by_year %>%
  mutate(state = state.name[match(state, state.abb)]) %>%
  group_by(state) %>%
  summarize(model = list(glm(cbind(n, year_total - n) ~ year, family = "binomial"))) %>%
  mutate(tidied = map(model, tidy, conf.int = T)) %>%
  unnest(tidied) %>%
  filter(term == "year") %>%
  mutate(p.value = format.pval(p.value),
         state = fct_reorder(state, estimate)) %>%
  ggplot(aes(estimate, state)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low,
                     xmax = conf.high),
                 height = 0.2) +
  labs(x = "logistic estimate",
       y = NULL,
       title = "Is the # of Awards Affected by Year?",
       subtitle = "Top 9 states with the most awards estimated.")