Women World Cup Visualization & Prediction
Mon, Nov 15, 2021
4-minute read
This blog post analyzes women world cup datasets from TidyTuesday, and here is the link to download the import the datasets. I am personally not into soccer, but this is an excellent opportunity to explore the datasets and hopefully my analysis can shed some light on women world cup and soccer!
Load the tidyverse
meta-package.
library(tidyverse)
theme_set(theme_bw())
Here are the datasets!
wwc_outcomes <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/wwc_outcomes.csv")
squads <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/squads.csv")
codes <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/codes.csv")
Join wwc_outcomes
and codes together
wwc_joined <- wwc_outcomes %>%
left_join(codes, by = "team") %>%
mutate(round = fct_relevel(round, "Group", "Quarter Final", "Round of 16", "Semi Final", "Third Place Playoff", "Final"))
Total wins
wwc_joined %>%
group_by(country) %>%
summarize(total_wins = sum(win_status == "Won")) %>%
ungroup() %>%
filter(total_wins > 0) %>%
mutate(country = fct_reorder(country, total_wins)) %>%
ggplot(aes(total_wins, country, fill = country)) +
geom_col(show.legend = F) +
labs(x = "total # of wins",
y = NULL,
title = "Total Wins from 1991 to 2019")
Champs
wwc_joined %>%
filter(round == "Final") %>%
group_by(country) %>%
summarize(`number of champs` = sum(win_status == "Won"),
`number of finals` = n()) %>%
ungroup() %>%
pivot_longer(-country) %>%
ggplot(aes(value, country, fill = name)) +
geom_col(position = "dodge") +
labs(x = NULL,
y = NULL,
fill = NULL,
title = "# of Times When Playing into Final & Winning the Champs")
Third Place Playoff
wwc_joined %>%
filter(round == "Third Place Playoff") %>%
group_by(country) %>%
summarize(`the third` = sum(win_status == "Won"),
`number of third place playoff` = n()) %>%
ungroup() %>%
pivot_longer(-country) %>%
ggplot(aes(value, country, fill = name)) +
geom_col(position = "dodge") +
labs(x = NULL,
y = NULL,
fill = NULL,
title = "# of Times When Getting the Third Position")
First four
wwc_joined %>%
filter(round %in% c("Third Place Playoff", "Final")) %>%
mutate(medal = case_when(
round == "Final" & win_status == "Won" ~ "Gold",
round == "Final" & win_status == "Lost" ~ "Silver",
round == "Third Place Playoff" & win_status == "Won" ~ "Broze",
round == "Third Place Playoff" & win_status == "Lost" ~ "4th",
)) %>%
mutate(medal = fct_relevel(medal, "Gold", "Silver", "Broze", "4th")) %>%
ggplot(aes(year, medal, color = country)) +
geom_point() +
geom_text(aes(label = country), hjust = 0, vjust = 1, check_overlap = T) +
theme(legend.position = "none",
panel.grid = element_blank()) +
scale_x_continuous(breaks = unique(wwc_joined$year)) +
labs(x = NULL,
y = NULL,
title = "The First Four Positions of Women's World Cup")
Predicitons
This little section is inspired by David Robinson’s code. The link is here.
outcomes <- wwc_joined %>%
group_by(year, yearly_game_id) %>%
mutate(rev_score = rev(score),
score_diff = score - rev_score) %>%
ungroup()
wwc_group_score <- outcomes %>%
filter(round == "Group") %>%
group_by(year, team) %>%
summarize(avg_group_score = mean(score, na.rm = T),
avg_score_diff = mean(score_diff, na.rm = T))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
outcomes %>%
inner_join(wwc_group_score, by = c("year", "team")) %>%
filter(round == "Final") %>%
mutate(year = paste(year, "Final")) %>%
ggplot(aes(country, avg_group_score, fill = win_status)) +
geom_col() +
facet_wrap(~year, scales = "free_x") +
theme(legend.position = c(0.8, 0.15)) +
labs(x = NULL,
y = "mean score in the Group round",
fill = NULL,
title = "Can the mean score in the Group round predict if a team won in Final?",
subtitle = "It seems like it can! (except 2011)")
Working on squads
It is interesting to see which position would score most.
squads %>%
group_by(pos) %>%
summarize(avg_goals = mean(goals, na.rm = T))
## # A tibble: 4 x 2
## pos avg_goals
## <chr> <dbl>
## 1 DF 3.07
## 2 FW 16.2
## 3 GK 0
## 4 MF 7.54
squads %>%
filter(goals > 0) %>%
mutate(pos = fct_reorder(pos, goals, median, na.rm = T)) %>%
ggplot(aes(goals, pos, fill = pos)) +
geom_boxplot(show.legend = F) +
scale_x_log10() +
labs(x = "# of goals",
y = "position")
Forward will get scores more often than other positions on the filed.
squads %>%
group_by(pos) %>%
summarize(sum = sum(goals, na.rm = T)) %>%
arrange(desc(sum)) %>%
pull(pos)
## [1] "FW" "MF" "DF" "GK"
squads %>%
mutate(pos = fct_relevel(pos, "FW", "MF", "DF", "GK")) %>%
ggplot(aes(age, goals, color = pos)) +
geom_point() +
geom_smooth() +
facet_wrap(~pos) +
theme(legend.position = "none") +
labs(title = "# of Goals V.S. Age Faceted by Position with Loess Curve")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Top 20 Players
squads %>%
arrange(desc(goals)) %>%
head(20) %>%
mutate(player = fct_reorder(player, goals),
country = fct_reorder(country, -goals, min)) %>%
ggplot(aes(goals, player, fill = country)) +
geom_col() +
labs(x = "# of goals",
y = NULL,
title = "Top 20 Players and Their Country")