Cocktail Visualization & PCA

Mon, Jan 10, 2022 5-minute read

In this blog post, some cocktail datasets will be analyzed. I personally don't drink and know little if not all to alcohol, but I know how to analyze data. Therefore, let me dive into the datasets in this blog post to understand more about various kinds of cocktails. Before we have the cocktail party, the datasets are from TidyTuesday.

library(tidyverse)
library(tidytext)
library(widyr)
library(reshape2)
library(ggraph)
library(igraph)
library(broom)
theme_set(theme_bw())

cocktails <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/cocktails.csv')

cocktails

## # A tibble: 2,104 x 13
##    row_id drink   date_modified       id_drink alcoholic category drink_thumb   
##     <dbl> <chr>   <dttm>                 <dbl> <chr>     <chr>    <chr>         
##  1      0 '57 Ch~ 2016-07-18 22:49:04    14029 Alcoholic Cocktail http://www.th~
##  2      0 '57 Ch~ 2016-07-18 22:49:04    14029 Alcoholic Cocktail http://www.th~
##  3      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
##  4      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
##  5      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
##  6      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
##  7      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
##  8      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
##  9      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
## 10      1 1-900-~ 2016-07-18 22:27:04    15395 Alcoholic Shot     http://www.th~
## # ... with 2,094 more rows, and 6 more variables: glass <chr>, iba <chr>,
## #   video <lgl>, ingredient_number <dbl>, ingredient <chr>, measure <chr>

boston_cocktails <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv')

boston_cocktails

## # A tibble: 3,643 x 6
##    name            category       row_id ingredient_number ingredient    measure
##    <chr>           <chr>           <dbl>             <dbl> <chr>         <chr>  
##  1 Gauguin         Cocktail Clas~      1                 1 Light Rum     2 oz   
##  2 Gauguin         Cocktail Clas~      1                 2 Passion Frui~ 1 oz   
##  3 Gauguin         Cocktail Clas~      1                 3 Lemon Juice   1 oz   
##  4 Gauguin         Cocktail Clas~      1                 4 Lime Juice    1 oz   
##  5 Fort Lauderdale Cocktail Clas~      2                 1 Light Rum     1 1/2 ~
##  6 Fort Lauderdale Cocktail Clas~      2                 2 Sweet Vermou~ 1/2 oz 
##  7 Fort Lauderdale Cocktail Clas~      2                 3 Juice of Ora~ 1/4 oz 
##  8 Fort Lauderdale Cocktail Clas~      2                 4 Juice of a L~ 1/4 oz 
##  9 Apple Pie       Cordials and ~      3                 1 Apple schnap~ 3 oz   
## 10 Apple Pie       Cordials and ~      3                 2 Cinnamon sch~ 1 oz   
## # ... with 3,633 more rows

Working on `boston_cocktails`

boston_cocktails %>%
  mutate(ingredient = fct_lump(ingredient, n = 10)) %>%
  filter(ingredient != "Other") %>%
  count(category, ingredient, sort = T) %>%
  ggplot(aes(category, ingredient, fill = n)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90),
        panel.grid = element_blank(),
        axis.ticks = element_blank()) +
  scale_fill_gradient2(high = "green",
                       low = "red",
                       mid = "pink",
                       midpoint = 50) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0)) +
  labs(fill = "ingredient count",
       title = "Top 10 ingredients count among all cocktail categories")

Dealwith with the measure column, which has the complex fractions (e.g. 1 1/2 oz). Tidyevalution is used here to transform the column into a numeric one.

boston_cocktails_oz <- boston_cocktails %>%
  filter(str_detect(measure, " oz")) %>%
  mutate(measure_oz = str_remove(measure, " oz$")) %>%
  mutate(measure_oz = str_replace(measure_oz, "\\s", "+")) %>%
  rowwise() %>%
  mutate(measure_oz = round(eval(parse(text = measure_oz)), 2)) %>%
  ungroup()

Average weight of cocktail

boston_cocktails_oz %>%
  filter(name != "Bloody Scotsman") %>%
  mutate(ingredient = fct_lump(ingredient, n = 10)) %>%
  filter(ingredient != "Other") %>% 
  group_by(category, ingredient) %>%
  summarize(total_weight = mean(measure_oz)) %>%
  ungroup() %>%
  mutate(category = fct_reorder(category, total_weight, sum),
         ingredient = fct_reorder(ingredient, total_weight, sum)) %>%
  ggplot(aes(total_weight, category, fill = ingredient)) +
  geom_col() +
  labs(x = "average weight (oz)",
       title = "Category-wise cocktail average weight")

The follwoing graph and PCA sections are followed by my other blog post about Bob Ross, and here is the link.

Graph

set.seed(2022)

boston_cocktails %>%
  add_count(ingredient) %>%
  filter(n >= 10) %>%
  pairwise_cor(ingredient, name, sort = T) %>%
  head(100) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation)) +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

PCA

binary_matrix <- boston_cocktails %>%
  mutate(value = 1) %>%
  acast(name ~ ingredient)

norm_matrix <- binary_matrix - colMeans(binary_matrix)
svd_results <- svd(norm_matrix)

tidy(svd_results, "v") %>%
  mutate(ingredient = colnames(binary_matrix)[column]) %>%
  filter(PC < 5) %>%
  group_by(PC) %>%
  slice_max(abs(value), n = 20) %>%
  ungroup() %>%
  mutate(PC = paste("Principal Component", PC),
         ingredient = reorder_within(ingredient, value, PC)) %>%
  ggplot(aes(value, ingredient, fill = value > 0)) +
  geom_col(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~PC, scales = "free_y") +
  ggtitle("First 4 principal components of the ingredients")

tidy(svd_results, "u")%>%
  mutate(name = rownames(binary_matrix)[row]) %>%
  filter(PC < 5) %>%
  group_by(PC) %>%
  slice_max(abs(value), n = 20) %>%
  ungroup() %>%
  mutate(PC = paste("Principal Component", PC),
         name = reorder_within(name, value, PC)) %>%
  ggplot(aes(value, name, fill = factor(PC))) +
  geom_col(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~PC, scales = "free_y") +
  ggtitle("First 4 principal components of the cocktail names")

After exploring boston_cocktails and its friend, let's visualize cocktail, which contains more interesting data.

cocktails_oz <- cocktails %>%
  mutate(measure_oz = str_remove(measure, "\\s*oz.*"),
         measure_oz = str_replace(measure_oz, "½", "1/2"), 
         alcoholic = str_to_title(alcoholic),
         category = str_remove_all(category, "\\s*"),
         glass = str_to_title(glass)) %>% 
  filter(!str_detect(measure_oz, "[:alpha:]|\\s")) %>%
  select(-c(id_drink, drink_thumb, iba, video, date_modified)) %>%
  rowwise() %>%
  mutate(measure_oz = round(eval(parse(text = measure_oz)), 2)) %>%
  ungroup()

cocktails_oz %>%
  distinct(drink, .keep_all = T) %>%
  count(category, alcoholic, sort = T) %>%
  filter(!is.na(alcoholic)) %>%
  mutate(category = fct_reorder(category, n, sum)) %>%
  ggplot(aes(alcoholic, category, size = n, color = alcoholic)) +
  geom_point() +
  scale_size_continuous(breaks = seq(0, 400, 50), range = c(2,6)) +
  guides(color = "none") +
  labs(x = NULL,
       y = NULL,
       size = "count",
       title = "Cocktail category and alcohol relations")

As we can see, most of the cocktails are alcoholic.

Various kinds of cocktails and their weight

cocktails_oz %>%
  group_by(drink) %>%
  mutate(total_weight = sum(measure_oz)) %>%
  ungroup() %>%
  filter(total_weight > 0) %>%
  distinct(drink, alcoholic, category, glass, total_weight) %>%
  filter(total_weight < 20,
         !is.na(alcoholic)) %>%
  mutate(category = reorder_within(category, total_weight, alcoholic, median)) %>%
  ggplot(aes(total_weight, category, fill = category)) +
  geom_boxplot(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~alcoholic, scales = "free_y", ncol = 1) +
  labs(x = "weight in total (oz)",
       title = "The weight of various cocktails") +
  theme(strip.text = element_text(size = 13),
        plot.title = element_text(size = 18))

Glass types and the cocktail weights

cocktails_oz %>%
  group_by(drink) %>%
  mutate(total_weight = sum(measure_oz)) %>%
  ungroup() %>%
  filter(total_weight > 0) %>%
  distinct(drink, alcoholic, category, glass, total_weight) %>%
  group_by(drink, glass) %>%
  ungroup() %>%
  filter(total_weight < 20,
         !is.na(alcoholic)) %>%
  mutate(glass = fct_lump(glass, n = 8)) %>%
  ggplot(aes(total_weight, glass, fill = glass)) +
  geom_violin(show.legend = F) +
  facet_wrap(~alcoholic, scales = "free") +
  labs(x = "weight in total (oz)",
       y = "",
       title = "Glass types and cocktail weights")