Beach Volleyball Data Visualization & Logistic Regression

Fri, Jan 7, 2022 5-minute read

In this blog post, a beach volleyball data set will be analyzed. The structure of the data set is a little different, meaning some kind of reshaping (pivot_longer or wider() and gather()) will be used frequently. The data set is from TidyTuesday.

library(tidyverse)
library(lubridate)
library(broom)
theme_set(theme_bw())
bv <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-19/vb_matches.csv") %>%
  rename(w_p1_name = "w_player1",
         w_p2_name = "w_player2",
         l_p1_name = "l_player1",
         l_p2_name = "l_player2")

Since the dataset is in a peculiar wide format, it is necessary to use pivot_longer() to reshape it. But before that, all columns that are not in character type need to be transformed to character type.

bv_longer <- bv %>%
  mutate(across(!where(is.character), as.character)) %>%
  pivot_longer(cols = c(starts_with("w"), starts_with("l"))) 


bv_longer
## # A tibble: 4,144,824 x 13
##    circuit tournament       country  year  date  gender match_num score duration
##    <chr>   <chr>            <chr>    <chr> <chr> <chr>  <chr>     <chr> <chr>   
##  1 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  2 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  3 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  4 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  5 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  6 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  7 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  8 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
##  9 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
## 10 AVP     Huntington Beach United ~ 2002  2002~ M      1         21-1~ 00:33:00
## # ... with 4,144,814 more rows, and 4 more variables: bracket <chr>,
## #   round <chr>, name <chr>, value <chr>

One thing we need to do for bv_longer is winner and loser should be speficied.

bv_longer <- bv_longer %>%
  mutate(winner_or_loser = if_else(str_detect(name, "^w_"), "Winner", "Loser")) 

Volleyable players' age

bv_longer %>%
  filter(str_detect(name, "age"),
         !is.na(round)) %>%
  mutate(round = fct_reorder(round, parse_number(round))) %>%
  ggplot(aes(as.numeric(value), winner_or_loser, fill = round)) +
  geom_boxplot() +
  facet_wrap(~round, ncol = 5) +
  theme(legend.position = "none") +
  labs(x = "age",
       y = "",
       title = "Age of winners and losers")

What is interesting is that there are people whose age is beyond 60 in Round 1. Other than that, the age difference between winner and loser is not obvious.

bv_longer %>%
  filter(str_detect(name, "hgt")) %>% 
  mutate(value = as.numeric(value)) %>%
  ggplot(aes(value, winner_or_loser, fill = winner_or_loser)) +
  geom_violin() +
  theme(legend.position = "none") +
  labs(x = "height (inches)",
       y = NULL,
       title = "The height of winners and losers")

The distributions of height for winners and losers are almost identical.

Game duration

bv_longer %>%
  mutate(duration = hms(duration),
         round = fct_reorder(round, parse_number(round))) %>%
  distinct(match_num, duration, round) %>% 
  filter(!is.na(round)) %>%
  ggplot(aes(minute(duration), round)) +
  geom_boxplot() +
  labs(x = "duration (minutes)",
       y = NULL,
       title = "How long does game in each round take?")

The final round

bv_longer %>%
  filter(str_detect(name, "country"),
         round == "Round 10") %>%
  count(value, winner_or_loser) %>%
  mutate(value = fct_reorder(value, n, sum)) %>%
  ggplot(aes(n, value, fill = winner_or_loser)) +
  geom_col() +
  scale_x_continuous(breaks = seq(0,13)) +
  labs(x = "# of wins or losses",
       y = NULL,
       fill = "",
       title = "Final Round (Round 10) Wins and Losses")

bv_longer_separate <- bv_longer %>%
  filter(!str_detect(name, "rank")) %>% 
  separate(name, into = c("win_or_lose","player", "variable"), sep = "_", extra = "merge", fill = "right") 
bv_spread <- bv_longer_separate %>%
  spread(key = "variable", value = "value") %>%
  type_convert()

bv_spread %>%
  summarize_all(~mean(is.na(.))) %>% 
  pivot_longer(everything()) 
## # A tibble: 26 x 2
##    name          value
##    <chr>         <dbl>
##  1 circuit    0       
##  2 tournament 0       
##  3 year       0       
##  4 date       0       
##  5 gender     0       
##  6 match_num  0       
##  7 score      0.000287
##  8 duration   0.0293  
##  9 bracket    0       
## 10 round      0.0643  
## # ... with 16 more rows
bv_spread <- bv_spread %>% 
  filter(!str_detect(score, "[:alpha:]"),
         !is.na(round)) %>%
  separate_rows(score, sep = ", ") %>%
  separate(score, into = c("win_score", "lose_score"), sep = "-", convert = TRUE) 

bv_spread %>% 
  distinct() %>%
  mutate(score_diff = win_score - lose_score,
         round = fct_reorder(round, parse_number(round))) %>% 
  ggplot(aes(abs(score_diff))) +
  geom_histogram() +
  facet_wrap(~round, scales = "free_y", ncol = 2) +
  scale_x_continuous(breaks = seq(1,20)) +
  labs(x = "score difference",
       title = "Score difference across all rounds") 

The scores are rather close, as we can see there are many games in each round whose difference is 2 or 3.

I would like to acknowledge that the following chunk of code is inspired by David Robinson's code.

by_player <- bv_spread %>% 
  group_by(name, gender) %>%
  summarize(n_matches = n(),
            pct_winner = mean(win_or_lose == "w"),
            first_game = min(date),
            last_game = max(date)) %>%
  arrange(desc(n_matches)) %>%
  ungroup() 

by_player %>%
  filter(n_matches > 100) %>%
  ggplot(aes(pct_winner, n_matches, color = gender)) +
  geom_point() +
  scale_x_log10(label = scales::percent) +
  scale_y_log10() +
  labs(x = "winning percentage",
       y = "# of matches",
       title = "The relation between winning and # of matches",
       subtitle = "Only players who played more than 100 games are considered")

Data processing

summarize_players <- . %>%
  summarize(n_matches = n(),
            pct_winner = mean(win_or_lose == "w"),
            avg_attacks = mean(tot_attacks, na.rm = TRUE),
            avg_errors = mean(tot_errors, na.rm = TRUE),
            avg_serve_errors = mean(tot_serve_errors, na.rm = TRUE),
            avg_kills = mean(tot_kills, na.rm = TRUE),
            avg_aces = mean(tot_aces, na.rm = TRUE),
            n_with_data = sum(!is.na(tot_attacks))) %>%
  ungroup() %>%
  arrange(desc(n_matches))

players_before_2019 <- bv_spread %>%
  filter(year < 2019) %>%
  group_by(name, gender, hgt, birthdate, country) %>%
  summarize_players() %>%
  filter(!is.na(avg_attacks))

players_2019 <- bv_spread %>%
  filter(year == 2019) %>%
  group_by(name, gender, hgt, birthdate, country, year,
           age = year - year(birthdate)) %>%
  summarize_players()
joined <- players_before_2019 %>%
  inner_join(players_2019 %>%
               select(name, pct_winner, n_matches),
             by = "name",
             suffix = c("", "_2019"))

joined %>% 
  ggplot(aes(pct_winner, pct_winner_2019, color = gender)) +
  geom_point() +
  geom_smooth(method = "lm", se = F, aes(group = 1)) +
  geom_text(aes(label = name), hjust = 1, vjust = 1, check_overlap = T) +
  labs(x = "wining percentage before 2019",
       y = "winning percentage in 2019",
       title = "Winning percentage before and in 2019") +
  scale_x_continuous(label = scales::percent) +
  scale_y_continuous(label = scales::percent) 

We can fit a logistic regression with cbind().

joined %>%
  mutate(n_wins_2019 = n_matches_2019 * pct_winner_2019) %>%
  glm(cbind(n_wins_2019, n_matches_2019 - n_wins_2019) ~ pct_winner + avg_errors + avg_serve_errors,
      data = .,
      family = "binomial") %>% 
  tidy() %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(y = term)) +
  geom_point(aes(x = estimate)) +
  geom_errorbarh(aes(xmin = estimate - std.error,
                 xmax = estimate + std.error)) 

Only pct_winner the pervious winning percentage before 2019 has a significant impact to the player winning percentage in 2019.