The Show of Friends Data Visualization

Sun, Feb 13, 2022 6-minute read

This blog post is an interesting one. It analyzes the show Friends from TidyTuesday.

library(tidyverse)
library(patchwork)
library(tidytext)
library(widyr)
library(tidylo)
theme_set(theme_light())

friends <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv')

friends_emotions <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_emotions.csv')

friends_info <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_info.csv') %>%
  mutate(title = str_trim(str_remove_all(str_to_title(title), "The One With|The One Where")))

Working on friends_into:

How popular is each episode?

The 1st way to show:

friends_info %>%
  pivot_longer(c(7,8), names_to = "metric") %>%
  mutate(metric = str_replace_all(metric, "_", " ")) %>%
  ggplot(aes(air_date, value, color = metric)) +
  geom_line() +
  geom_text(aes(label = title), 
            check_overlap = T, size = 3, alpha = 0.5,
            hjust = 1, vjust = 1) +
  geom_point() +
  facet_wrap(~metric, scales = "free_y") +
  labs(x = "air date",
       y = NULL,
       title = "How Popular was each Friends Episode?") +
  theme(legend.position = "none",
        strip.text = element_text(size = 15),
        plot.title = element_text(size = 18))

The 2nd way to show:

friends_info %>%
  ggplot(aes(air_date, imdb_rating)) +
  geom_line() +
  geom_point(aes(size = us_views_millions, color = factor(season))) +
  geom_text(aes(label = title), 
            check_overlap = T, size = 3, alpha = 0.5,
            hjust = 1, vjust = 1) +
  scale_color_discrete(guide = "none") +
  theme(plot.title = element_text(size = 18)) +
  labs(size = "US views (millions)",
       x = "air date",
       y = "IMDB rating",
       title = "How Popular was each Friends Episode?")

Episode Director:

directed_boxplot <- function(column) {
  friends_info %>%
  add_count(directed_by) %>%
  filter(n > 5) %>%
  mutate(directed_by = fct_reorder(directed_by, {{ column }}, median, na.rm = T)) %>%
  ggplot(aes({{ column }}, directed_by)) +
  geom_boxplot()
}

directed_boxplot(imdb_rating) / directed_boxplot(us_views_millions)

There is another way to generate the faceted plot above:

plots <- map(syms(list("us_views_millions", "imdb_rating")), directed_boxplot)
reduce(plots, `/`)

friends_info %>%
  filter(fct_lump(directed_by, n = 9) != "Other") %>%
  mutate(directed_by = fct_reorder(directed_by, -us_views_millions, sum)) %>%
  ggplot(aes(us_views_millions, imdb_rating, color = factor(season))) +
  geom_point() +
  geom_text(aes(label = title), vjust = 1, hjust = 0.8, 
            check_overlap = T, size = 3.5) +
  facet_wrap(~directed_by, ncol = 5) +
  labs(x = "US views (millions)",
       y = "IMDB rating",
       color = "season",
       title = "Top 10 Directors") +
  theme(strip.text = element_text(size = 13),
        plot.title = element_text(size = 18))

Writers’ information:

friends_info %>%
  mutate(written_by = str_remove_all(written_by, "Story by.+$|Teleplay by.+$")) %>%
  separate_rows(written_by, sep = " & ") %>% 
  add_count(written_by) %>%
  filter(written_by != "",
         n > 5) %>%
  pivot_longer(cols = c(7:8)) %>%
  mutate(name = str_to_upper(str_replace_all(name, "_", " ")),
         written_by = reorder_within(written_by, value, name)) %>%
  ggplot(aes(value, written_by)) +
  geom_boxplot() +
  scale_y_reordered() +
  facet_wrap(~name, scales = "free") +
  labs(x = NULL,
       y = "episode writer",
       title = "Episode Writers' Performance") +
  theme(strip.text = element_text(size = 13),
        plot.title = element_text(size = 18))

Combining friends and friends_emotions:

friends_joined <- friends %>%
  left_join(friends_emotions, by = c("season", "episode", "scene", "utterance")) %>%
  left_join(friends_info %>% select(season, episode, imdb_rating), by = c("season", "episode")) %>%
  filter(fct_lump(speaker, n = 6) != "Other")

friends_joined

## # A tibble: 51,047 x 8
##    text               speaker season episode scene utterance emotion imdb_rating
##    <chr>              <chr>    <dbl>   <dbl> <dbl>     <dbl> <chr>         <dbl>
##  1 There's nothing t~ Monica~      1       1     1         1 <NA>            8.3
##  2 C'mon, you're goi~ Joey T~      1       1     1         2 <NA>            8.3
##  3 All right Joey, b~ Chandl~      1       1     1         3 <NA>            8.3
##  4 Wait, does he eat~ Phoebe~      1       1     1         4 <NA>            8.3
##  5 Just, 'cause, I d~ Phoebe~      1       1     1         6 <NA>            8.3
##  6 Okay, everybody r~ Monica~      1       1     1         7 <NA>            8.3
##  7 Sounds like a dat~ Chandl~      1       1     1         8 <NA>            8.3
##  8 Alright, so I'm b~ Chandl~      1       1     1        10 <NA>            8.3
##  9 Then I look down,~ Chandl~      1       1     1        12 <NA>            8.3
## 10 Instead of...?     Joey T~      1       1     1        13 <NA>            8.3
## # ... with 51,037 more rows

Who speaks the most in each season?

friends_joined %>%
  mutate(season = fct_reorder(paste("Season", season), season)) %>%
  count(speaker, season, sort = T) %>%
  mutate(speaker = reorder_within(speaker, n, season)) %>%
  ggplot(aes(n, speaker)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~season, scales = "free_y") +
  labs(x = "# of lines",
       y = NULL,
       title = "Who Speaks the Most?") +
  theme(strip.text = element_text(size = 15),
        plot.title = element_text(size = 18))

It’s either Ross or Chandler or Rachel who speaks the most!

friends_joined %>%
  group_by(season, episode, speaker) %>%
  summarize(episode_line_per_speaker = n(),
            imdb_rating) %>%
  ungroup() %>%
  distinct() %>%
  mutate(episode = paste0("E", episode, "(",imdb_rating, ")"),
         season = paste("Season", season)) %>%
  mutate(season = fct_reorder(season, parse_number(season)),
         episode = fct_reorder(episode, parse_number(episode))) %>%  
  ggplot(aes(episode_line_per_speaker, episode, fill = speaker)) +
  geom_col() +
  facet_wrap(~season, scales = "free_y") +
  labs(x = "episode line",
       y = NULL,
       fill = NULL,
       title = "The Overall Distribution of Lines Across All Friends Episodes",
       subtitle = "The numbers in the bracket refer to IMDB rating") +
  theme(plot.title = element_text(size = 18),
        strip.text = element_text(size = 15))

TF-IDF

friends_joined %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!str_detect(word, "\\d")) %>%
  count(speaker, word, sort = T) %>%
  bind_tf_idf(word, speaker, n) %>%
  group_by(speaker) %>%
  slice_max(tf_idf, n = 20, with_ties = F) %>%
  ungroup() %>% 
  mutate(word = reorder_within(word, tf_idf, speaker),
         speaker = fct_reorder(speaker, -tf_idf, sum)) %>%
  ggplot(aes(tf_idf, word, fill = speaker)) +
  geom_col(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~speaker, scales = "free_y") +
  theme(strip.text = element_text(size = 15),
        plot.title = element_text(size = 18)) +
  labs(x = "TF-IDF",
       y = NULL,
       title = "Top 20 Words with the Largest TF-IDF Values for 6 Major Characters")

Weighted log odds

After being inspired by David Robinson’s code and Julia Silge’s tidylo package, I have:

friends_joined %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!str_detect(word, "\\d")) %>%
  count(speaker, word, sort = T) %>%
  bind_log_odds(speaker, word, n) %>%
  filter(n > 20) %>%
  group_by(speaker) %>%
  slice_max(log_odds_weighted, n = 15) %>%
  mutate(word = reorder_within(word, log_odds_weighted, speaker)) %>%
  ggplot(aes(log_odds_weighted, word, fill = speaker)) +
  geom_col(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~speaker, scales = "free_y") +
  labs(x = "weighted log odds",
       y = NULL,
       title = "Weighted Log Odds for 6 Main Characters") +
  theme(strip.text = element_text(size = 15),
        plot.title = element_text(size = 18))

It seems like log_odds_weighted is analogous to TF-IDF, and it is just another way to show which words are more “targeted” from a specific speaker.

The following code is inspired by David Robinson’s cdoe to see which characters tend to appear together by using pairwise_cor() from the widyr package.

friends %>%
  filter(fct_lump(speaker, n = 6) != "Other") %>%
  unite(scene_id, season, episode, scene) %>%
  count(speaker, scene_id, sort =T) %>%
  pairwise_cor(speaker, scene_id, n) %>%
  mutate(item1 = reorder_within(item1, correlation, item2)) %>%
  ggplot(aes(correlation, item1, fill = item2)) +
  geom_col(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~item2, scales = "free_y") +
  labs(y = NULL,
       title = "Who Tends to be Appearing Together?") +
  theme(strip.text = element_text(size = 15),
        plot.title = element_text(size = 18))

Rachel and Chandler appear together least in the show.