Tennis Grand Slam Visualization & Winning Tournament Prediction

Thu, Oct 21, 2021 4-minute read

The datasets in this blog post are about 4 Tennis Grand Slams and their winners across various decades since 1968 from TidyTuesday.

library(tidyverse)
library(tidytext)
timeline <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/grand_slam_timeline.csv")

grand_slams <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/grand_slams.csv") %>%
  mutate(grand_slam = str_replace(grand_slam, "_", " "),
         grand_slam = str_to_title(grand_slam)) %>%
  mutate(grand_slam = fct_recode(grand_slam, "US Open" = "Us Open"))

player_dob <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/player_dob.csv") %>%
  mutate(first_title_age = age/365) %>%
  select(-age)

Working on timeline tibble

timeline %>%
  count(year, tournament, gender, sort = T) %>%
  ggplot(aes(year, n, color = gender)) +
  geom_line(size = 1) +
  facet_wrap(~tournament) +
  labs(x = NULL,
       y = "player count",
       color = NULL,
       title = "Yearly # of Male & Female Players Attending Tournaments") +
  theme(
    strip.text = element_text(size = 15, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 13),
    legend.text = element_text(size = 10),
    plot.title = element_text(size = 16)
  )

timeline %>% 
  count(player, outcome, gender, sort = T) %>% 
  drop_na() %>%
  filter(!str_detect(outcome, "Round|Qua")) %>%
  group_by(outcome) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(player = reorder_within(player, n, outcome)) %>%
  ggplot(aes(n, player, fill = gender)) +
  geom_col() +
  facet_wrap(~outcome, scales = "free_y") +
  scale_y_reordered()+
  labs(x = "count",
       y = NULL,
       fill = NULL,
       title = "Top 10 Players in All Outcomes") +
  theme(
    strip.text = element_text(size = 15, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 13),
    legend.text = element_text(size = 10),
    plot.title = element_text(size = 18)
  )

Working on grand_slams&player_dob

Join two tibbles together.

joined_tbl <- grand_slams %>%
  rename(tournament = "grand_slam") %>%
  left_join(player_dob, by = "name") %>%
  mutate(tournament_age = round(as.numeric(difftime(tournament_date ,date_of_birth))/365, 1),
         first_title_age = round(first_title_age, 1)) %>%
  mutate(grand_slam = if_else(str_detect(grand_slam, "Australian Open"), "Australian Open", grand_slam))
 

joined_tbl %>%
  filter(rolling_win_count == 1) %>%
  slice_min(first_title_age, n = 20) %>% 
  mutate(name = fct_reorder(name, first_title_age)) %>%
  ggplot(aes(first_title_age, name, fill = grand_slam)) +
  geom_col() +
  geom_text(aes(label = year, color = grand_slam),nudge_x = 0.3, show.legend = F) +
  labs(x = "first title age",
       y = NULL,
       fill = "tournament",
       title = "The Youngest 20 Players Winning Grand Slam Title") 

joined_tbl %>%
  group_by(name) %>%
  slice_max(rolling_win_count, n = 1) %>%
  #filter(rolling_win_count > 5) %>% 
  ungroup() %>%
  drop_na() %>%
  mutate(name = reorder_within(name, rolling_win_count, gender)) %>%
  ggplot(aes(rolling_win_count, name, fill = grand_slam)) +
  geom_col() +
  facet_wrap(~gender, scales = "free") +
  scale_y_reordered() +
  labs(x = "total win count",
       y = "",
       fill = "tournament")+
  theme(
    strip.text = element_text(size = 15, face = "bold"),
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 11),
    legend.text = element_text(size = 10),
    plot.title = element_text(size = 18)
  )

Predicting the winner of tournament

The following code is directly copied from David Robinson’s code with a few minor modifications. This is just for learning purposes, as this is my first time using a number of functions presented in the code.

dob <- player_dob %>%
  select(player = name, date_of_birth)

tournaments <- grand_slams %>%
  select(year, tournament = grand_slam, gender, tournament_date)

timeline_processed <- timeline %>%
  inner_join(tournaments, by = c("year", "tournament", "gender")) %>%
  arrange(player, tournament_date) %>%
  filter(outcome != "Absent",
         !str_detect(outcome, "Qualif")) %>%
  group_by(player) %>%
  mutate(rolling_play_count = row_number() - 1,
         rolling_won_count = lag(cumsum(outcome == "Won"), default = 0),
         rolling_finals_count = lag(cumsum(outcome %in% c("Won", "Finalist")), default = 0)) %>% 
  ungroup() %>%
  filter(!(year == 1977 & tournament == "Australian Open")) %>%
  mutate(won = outcome == "Won")

timeline_processed %>%
  filter(outcome %in% c("Finalist", "Won")) %>%
  arrange(tournament_date) %>%
  group_by(rolling_won_count = pmin(rolling_won_count, 10)) %>%
  summarize(pct_won = mean(won),
            observations = n()) %>%
  ggplot(aes(rolling_won_count, pct_won)) +
  geom_line() +
  expand_limits(y = 0)

Using dput() to save a lot of mundane typing!

match() is nifty here to create ranking.

outcome_rankings <- c("1st Round", "2nd Round", "3rd Round", "4th Round", "Quarterfinalist", 
"Retired", "Semi-finalist", "Finalist", "Won")

tournament_scores <- timeline_processed %>%
  filter(outcome %in% outcome_rankings) %>%
  mutate(score_contribution = match(outcome, outcome_rankings)) %>% 
  group_by(player) %>%
  mutate(previous_average = lag(cummean(score_contribution), default = 1)) %>%
  ungroup() %>% 
  mutate(previous_performance = outcome_rankings[round(previous_average)],
         previous_performance = fct_relevel(previous_performance, outcome_rankings))
tournament_scores %>%
  group_by(previous_performance) %>%
  summarize(observations = n(),
            probability_win = mean(won)) %>%
  ggplot(aes(previous_performance, probability_win, group = 1)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent_format()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Player's average previous performance",
       y = "Probability of winning tournament",
       title = "Does past performance in Grand Slams predict future success?",
       subtitle = "Treating rounds as if they can be averaged linearly")