NCAA Women's Basketball Data Visualiztion

Wed, Feb 23, 2022 4-minute read

The NCAA dataset is from TidyTuesday.

library(tidyverse)
library(scales)
theme_set(theme_light())

tournament <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-06/tournament.csv') %>%
  mutate(tourney_finish = factor(tourney_finish, levels = c("1st", "2nd", "RSF", "RF", "NSF", "N2nd", "Champ")))

tournament

## # A tibble: 2,092 x 19
##     year school      seed conference conf_w conf_l conf_percent conf_place reg_w
##    <dbl> <chr>      <dbl> <chr>       <dbl>  <dbl>        <dbl> <chr>      <dbl>
##  1  1982 Arizona S~     4 Western C~     NA     NA         NA   -             23
##  2  1982 Auburn         7 Southeast~     NA     NA         NA   -             24
##  3  1982 Cheyney        2 Independe~     NA     NA         NA   -             24
##  4  1982 Clemson        5 Atlantic ~      6      3         66.7 4th           20
##  5  1982 Drake          4 Missouri ~     NA     NA         NA   -             26
##  6  1982 East Caro~     6 Independe~     NA     NA         NA   -             19
##  7  1982 Georgia        5 Southeast~     NA     NA         NA   -             21
##  8  1982 Howard         8 Mid-Easte~     NA     NA         NA   -             14
##  9  1982 Illinois       7 Big Ten        NA     NA         NA   -             21
## 10  1982 Jackson S~     7 Southwest~     NA     NA         NA   -             28
## # ... with 2,082 more rows, and 10 more variables: reg_l <dbl>,
## #   reg_percent <dbl>, how_qual <chr>, x1st_game_at_home <chr>,
## #   tourney_w <dbl>, tourney_l <dbl>, tourney_finish <fct>, full_w <dbl>,
## #   full_l <dbl>, full_percent <dbl>

How many school teams per conference over the years?

tournament %>%
  filter(fct_lump(conference, n = 9) != "Other") %>%
  count(year, conference, sort = T) %>%
  mutate(conference = fct_reorder(conference, -n, sum)) %>%
  ggplot(aes(year, n, color = conference)) +
  geom_line() +
  geom_point() +
  facet_wrap(~conference) +
  theme(legend.position = "none",
        plot.title = element_text(size = 18),
        strip.text = element_text(size = 15)) +
  scale_y_continuous(breaks = seq(1,10)) +
  labs(x = NULL,
       y = "# of teams",
       title = "How Many Teams per Conference?")

The highest overall winning percentage per year:

tournament %>%
  group_by(year) %>%
  slice_max(full_percent, n = 1) %>%
  ungroup() %>%
  ggplot(aes(year, full_percent)) +
  geom_point(aes(color = tourney_finish), size = 3) +
  geom_line() +
  geom_text(aes(label = school), hjust = 0, vjust = 1, check_overlap = T) +
  scale_y_continuous(label = percent_format(scale = 1)) +
  scale_x_continuous(breaks = seq(min(tournament$year), max(tournament$year), 2)) +
  labs(x = NULL,
       y = "overall winning percentage",
       color = "tournament ended",
       title = "Yearly Team with the Highest Winning Percentage") +
  theme(axis.text.x = element_text(angle = 90))

Champs:

tournament %>%
  filter(tourney_finish == "Champ") %>%
  count(conference, how_qual, sort = T) %>%
  mutate(conference = fct_reorder(conference, n, sum)) %>%
  ggplot(aes(n, conference, fill = how_qual)) +
  geom_col() +
  scale_x_continuous(breaks = seq(1,10, 2)) +
  labs(x = "# of champs",
       y = NULL, 
       fill = "how qualified",
       title = "# of Champs per Conference")

tournament %>%
  filter(full_l < 20) %>% 
  mutate(tourney_finish = if_else(tourney_finish == "Champ", "Champ", "Non-Champ")) %>%
  ggplot(aes(full_w, full_l, color = tourney_finish)) +
  geom_point(aes(size = if_else(tourney_finish == "Champ", 4, 3))) +
  facet_wrap(~year) +
  scale_size_continuous(guide = NULL) +
  labs(x = "# of wins",
       y = "# of losses",
       color = "",
       title = "How did Champ Perform per Year?")

Seeds:

tournament %>%
  filter(!is.na(tourney_finish)) %>%
  group_by(seed, tourney_finish) %>%
  summarize(n = n()) %>%
  ungroup() %>% 
  group_by(seed) %>%
  mutate(nn = sum(n),
         percentage = n/nn) %>% 
  ungroup() %>%
  ggplot(aes(tourney_finish, seed, fill = percentage)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(percentage * 100), "%")), check_overlap = T) +
  scale_y_reverse(breaks = seq(1,16), expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  scale_fill_gradient(high = "red",
                      low = "green",
                      label = percent) +
  theme(panel.grid = element_blank()) +
  labs(x = "tournament finished at ...",
       y = "seed #",
       fill = NULL,
       title = "How Likely Each Seed Will Be Finished at the Tournament?")

The following code is inspired by David Robinson’s code

tournament %>%
  mutate(conference = fct_lump(conference, 8)) %>%
  group_by(conference) %>%
  summarize(n = n(),
            seed = mean(seed, na.rm = T),
            pct_win = mean(tourney_finish == "Champ", na.rm = T),
            pct_final_four = mean(tourney_finish %in% c("Champ", "N2nd", "NSF"))) %>%
  ungroup() %>%
  ggplot(aes(pct_final_four, pct_win)) +
  geom_point(aes(size = seed)) +
  geom_text(aes(label = conference), check_overlap = T, hjust = 0, vjust = 1) +
  scale_size_continuous(breaks = seq(1,10),
                        labels = seq(10,1)) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  labs(x = "final four admission",
       y = "champ winning percentage",
       title = "How Likely each Conference Entering into Final Four and Winning Champ")