X-men Data Visualization (Lollipop Plot included)

Thu, Jan 20, 2022 7-minute read

The data sets about X-men for this blog post come from TidyTuesday. Instead of loding them manually this time, I will use the tidytuesdayR package. You can install it by simply typing install.packages("tidytuesdayR") on your console!

Before doing any data analysis, I do need to admit that I don't know anything about X-men, meaning there is no any background information I hold. This blog post is just simply about analyzing data.

Now load the packages!

library(tidyverse)
library(tidytuesdayR)
library(ggraph)
library(tidytext)
library(cowplot)
theme_set(theme_bw())

tt_load() is the way to load the data sets.

#tuesdata <- tidytuesdayR::tt_load('2020-06-30')

Or you can read them manually.

comic_bechdel <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-30/comic_bechdel.csv')

character_visualization <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-30/character_visualization.csv')

characters <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-30/characters.csv')

xmen_bechdel <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-30/xmen_bechdel.csv')

locations <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-30/locations.csv')

Analyzing character_visualization

character_visualization  <- character_visualization %>%
  mutate(character = str_remove_all(character, "\\s.+|/.+|\\s?=.+"))
character_visualization

## # A tibble: 9,800 x 7
##    issue costume character    speech thought narrative depicted
##    <dbl> <chr>   <chr>         <dbl>   <dbl>     <dbl>    <dbl>
##  1    97 Costume Editor            0       0         0        0
##  2    97 Costume Omnipresent       0       0         0        0
##  3    97 Costume Professor         0       0         0        0
##  4    97 Costume Wolverine         7       0         0       10
##  5    97 Costume Cyclops          24       3         0       23
##  6    97 Costume Marvel            0       0         0        0
##  7    97 Costume Storm            11       0         0        9
##  8    97 Costume Colossus          9       0         0       17
##  9    97 Costume Nightcrawler     10       0         0       17
## 10    97 Costume Banshee           0       0         0        5
## # ... with 9,790 more rows

How do character features change across the issues?

character_visualization  %>%
  pivot_longer(cols = c(speech:depicted)) %>%
  group_by(issue, name, costume) %>%
  summarize(value = sum(value)) %>%
  ungroup() %>%
  ggplot(aes(issue, value, color = costume)) +
  geom_line(size = 1) +
  facet_wrap(~ name) +
  theme(strip.text = element_text(size = 13)) +
  labs(x = "issue #",
       y = "feature summation",
       color = NULL,
       title = "How does every issue differ among all features?")

X-men characters:

character_visualization %>%
  mutate(character = str_remove(character, "\\(.\\)")) %>%
  pivot_longer(cols = c(speech:depicted)) %>%
  group_by(issue, character, name) %>%
  summarize(value = sum(value)) %>%
  ungroup() %>%
  filter(value > 0) %>%
  ggplot(aes(name, character, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(high = "green",
                       low = "red",
                       midpoint = 50) +
  theme(panel.grid = element_blank(),
        axis.title = element_text(size = 12),
        axis.text.x = element_text(size = 11)) +
  labs(x = "feature",
       y = "X-men character",
       fill = "feature value",
       title = "Characters' features and their values")

acorss() is used in conjunction with summarize() with two summary functions (sum() & mean()). As I use more and more across(), it is indeed a nifty function to keep in mind when carrying out data wrangling.

by_character <- character_visualization %>%
  group_by(character) %>%
  summarize(across(speech:depicted, list(total = sum, avg = mean))) %>%
  ungroup()

Who speaks more and thinks less?

total <- by_character %>%
  filter(speech_total > 0, thought_total > 0) %>%
  ggplot(aes(speech_total, thought_total)) +
  geom_point(aes(size = speech_total/thought_total, color = speech_total/thought_total)) +
  geom_text(aes(label = character), vjust = 1, hjust = 1, check_overlap = T) +
  scale_x_log10() +
  scale_y_log10() +
  expand_limits(y = 1) +
  labs(x = "# lines of speeches",
       y = "# lines of thoughts",
       size = "speak/thought ratio",
       title = "Who speaks more and thinks less?") +
  scale_size_continuous(range = c(3,6)) +
  scale_color_continuous(guide = "none") +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 13),
        axis.text = element_text(size = 12))
  
average <- by_character %>%
  filter(thought_avg > 0, speech_avg > 0) %>%
  ggplot(aes(speech_avg, thought_avg)) +
  geom_point(aes(size = speech_avg/thought_avg, color = speech_total/thought_total)) +
  geom_text(aes(label = character), vjust = 1, hjust = 1, check_overlap = T) +
  scale_x_log10() +
  scale_y_log10() +
  expand_limits(y = 1) +
  labs(x = "average # lines of speeches",
       size = "speak/thought ratio",
       y = "average # lines of thoughts") +
  scale_size_continuous(range = c(3,6)) +
  scale_color_continuous(guide = "none")  +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 13),
        axis.text = element_text(size = 12))


plot_grid(total, average, align = c("h", "v"))

The lollipop plot below is inspired by David Robinson's code. This is my first time making such an intriguing plot!

character_visualization %>%
  group_by(costume, character) %>%
  summarize(total_speech = sum(speech)) %>%
  filter(total_speech > 0) %>%
  ungroup() %>% 
  pivot_wider(names_from = "costume", values_from = "total_speech") %>%
  janitor::clean_names() %>%
  mutate(speech_costume_ratio = costume/non_costume,
         character = fct_reorder(character, speech_costume_ratio)) %>%
  ggplot(aes(speech_costume_ratio, character, color = character)) +
  geom_point(size = 4) +
  geom_errorbarh(aes(xmin = 1, xmax = speech_costume_ratio), height = 0, size = 1) +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(x = "character with and without costume speech ratio",
       y = NULL,
       title = "Lollipop Plot")

Working on characters

Numeric columns visualization:

characters <- characters %>%
  mutate(character = str_remove_all(character, "\\s.+|/.+|\\s?=.+|\\s?,.+|\\(2\\)"))

characters %>%
  select(character, where(is.numeric)) %>%
  pivot_longer(-c(character, issue), names_to = "numeric_column") %>%
  group_by(issue, numeric_column) %>%
  summarize(value = sum(value)) %>%
  ungroup() %>%
  filter(value > 0,
         value < 1e+05) %>%
  mutate(numeric_column = str_replace_all(numeric_column, "_", " ")) %>%
  ggplot(aes(issue, value, color = numeric_column)) +
  geom_line(size = 1) +
  facet_wrap(~numeric_column) +
  theme(legend.position = "none",
        strip.text = element_text(size = 13),
        plot.title = element_text(size = 15)) +
  labs(x = "issue #",
       y = "numeric value summation",
       title = "Numeric columns' summation values")

Character graph:

set.seed(2022)
characters %>%
  select(issue, where(is.character)) %>%
  pivot_longer(-c(character, issue), names_to = "character_column") %>%
  drop_na() %>%
  filter(value != "1") %>%
  count(character, value, sort = T) %>%
  head(100) %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(width = n, color = n), alpha = 0.5) +
  geom_node_point() +
  geom_node_text(aes(label = name, color = name), hjust = 1, vjust = 1, check_overlap = T, size = 5) +
  theme_void() +
  guides(color = "none", edge_color = "none") +
  labs(edge_width = "# of character entanglements",
       title = "How are the characters linked across all issues?")

Working on locations:

locations

## # A tibble: 1,413 x 4
##    issue location                                                context notes
##    <dbl> <chr>                                                   <chr>   <chr>
##  1    97 Space                                                   Dream   <NA> 
##  2    97 X-Mansion                                               Present <NA> 
##  3    97 Rio Diablo Research Facility                            Present <NA> 
##  4    97 Kennedy International Airport                           Present <NA> 
##  5    97 Undisclosed Villain Location                            Present <NA> 
##  6    98 Rockefeller Centre                                      Present <NA> 
##  7    98 Boat in the Bahama Out Islands                          Present <NA> 
##  8    98 Sentinel Space Station (former SHIELD Orbital Platform) Present <NA> 
##  9    98 X-Mansion                                               Present <NA> 
## 10    98 Sentinel Space Station (former SHIELD Orbital Platform) Present <NA> 
## # ... with 1,403 more rows

The popular X-men locations:

locations %>%
  count(location, context, sort = T) %>%
  head(20) %>%
  mutate(location = fct_reorder(location, n, sum)) %>%
  ggplot(aes(n, location, fill = context)) +
  geom_col() +
  labs(x = "#",
       y = "",
       title = "Top 20 locations separated by context")

`comic_bechdel`

comic_bechdel

## # A tibble: 308 x 9
##    series  issue title writer artist cover_artist pass_bechdel page_number notes
##    <chr>   <dbl> <chr> <chr>  <chr>  <chr>        <chr>        <chr>       <chr>
##  1 Avenge~   105 Head~ Steve~ <NA>   John Buscema yes          19          <NA> 
##  2 Avenge~   106 A tr~ Steve~ <NA>   <NA>         no           <NA>        <NA> 
##  3 Avenge~   107 The ~ Steve~ <NA>   <NA>         no           <NA>        <NA> 
##  4 Avenge~   108 Chec~ Steve~ <NA>   <NA>         no           <NA>        <NA> 
##  5 Avenge~   109 <NA>  Steve~ Don H~ John Buscema no           <NA>        <NA> 
##  6 Avenge~   110 <NA>  Steve~ <NA>   Don Heck     no           <NA>        <NA> 
##  7 Avenge~   111 <NA>  Steve~ <NA>   <NA>         no           <NA>        <NA> 
##  8 Avenge~   112 Plus~ Steve~ <NA>   Don Heck     yes          4           <NA> 
##  9 Avenge~   113 Your~ Steve~ <NA>   <NA>         no           <NA>        <NA> 
## 10 Avenge~   114 The ~ Steve~ <NA>   Ron Wilson   yes          10          <NA> 
## # ... with 298 more rows

comic_bechdel %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words) %>%
  filter(!is.na(word)) %>%
  count(pass_bechdel, word, sort = T) %>%
  group_by(pass_bechdel) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = fct_reorder(word, n, sum)) %>%
  ggplot(aes(n, word, fill = pass_bechdel)) +
  geom_col() +
  labs(fill = "bechdel passed?",
       y = NULL,
       title = "Top 10 words from both bechdel passed and not passed group")

Writers' bechdel passing results:

comic_bechdel %>%
  separate_rows(writer, sep = ", ") %>%
  mutate(writer = str_remove(writer, "\\*")) %>%
  group_by(writer, pass_bechdel) %>%
  summarize(n = n()) %>%
  drop_na() %>%
  ungroup() %>%
  mutate(writer = fct_reorder(writer, n, sum)) %>%
  ggplot(aes(n, writer, fill = pass_bechdel)) +
  geom_col() +
  labs(x = "total # of issues",
       y = NULL,
       fill = "bechdel passed?",
       title = "How's each writer's bechdel passing?")