Vidoe Games Data Visualization
Sun, Mar 20, 2022
5-minute read
This blog post analyzes vidoe games time-series data from TidyTuesday. I personally never play video games and know nothing about them, but through data analysis in this post, I can use data to shed some light on what video games are popular over time and which games gained popularity during the COVID lockdown (March 2020).
Load the packages:
library(tidyverse)
library(lubridate)
library(scales)
library(tidytext)
theme_set(theme_bw())
Load and clean up the data:
games <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-16/games.csv') %>%
rename(avg_player_num = avg,
gain_from_pre_month = gain) %>%
mutate(avg_peak_perc = as.numeric(str_remove(avg_peak_perc, "%"))) %>%
left_join(tibble(month = month.name,
month_num = 1:12),
by = "month") %>%
select(-month) %>%
rename(month = month_num,
avg_peak_pct = avg_peak_perc,
peak_num = peak) %>%
relocate(month, .after = "year") %>%
mutate(date = make_date(year, month),
gain_from_pre_month = replace_na(gain_from_pre_month, 0),
gamename = str_to_title(gamename))
games
## # A tibble: 83,631 x 8
## gamename year month avg_player_num gain_from_pre_m~ peak_num avg_peak_pct
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Counter-St~ 2021 2 741013. -2196. 1123485 66.0
## 2 Dota 2 2021 2 404832. -27840. 651615 62.1
## 3 Playerunkn~ 2021 2 198958. -2290. 447390 44.5
## 4 Apex Legen~ 2021 2 120983. 49216. 196799 61.5
## 5 Rust 2021 2 117742. -24375. 224276 52.5
## 6 Team Fortr~ 2021 2 101231. 18083. 133620 75.8
## 7 Grand Thef~ 2021 2 90648. -10603. 146438 61.9
## 8 Tom Clancy~ 2021 2 72383. -5335. 113338 63.9
## 9 Rocket Lea~ 2021 2 53723. -5726. 103429 51.9
## 10 Path Of Ex~ 2021 2 46920. -766. 90539 51.8
## # ... with 83,621 more rows, and 1 more variable: date <date>
Average # of players and peak # of players:
games %>%
group_by(date) %>%
summarize(across(c(avg_player_num, peak_num), mean),
n = n()) %>%
rename(`average player number` = avg_player_num,
`peak player number` = peak_num) %>%
pivot_longer(cols = 2:3) %>%
ggplot(aes(date, value, color = name)) +
geom_line() +
geom_vline(xintercept = as.Date("2020-04-01"),
lty = 2,
color = "red") +
geom_point(aes(size = n)) +
scale_size_continuous(range = c(1,3)) +
scale_x_date(date_breaks = "6 months",
date_labels = "%y-%m") +
labs(x = NULL,
y = "# of players",
color = NULL,
size = "# of games",
title = "Monthly # of Game Players",
subtitle = "The vertical line is the time when COVID lockdown taking place")
Top 5 games with most players at the same time per year:
games %>%
group_by(year, gamename) %>%
slice_max(peak_num, n = 1) %>%
ungroup() %>%
group_by(year) %>%
slice_max(peak_num, n = 5) %>%
ungroup() %>%
mutate(gamename = reorder_within(gamename, peak_num, year)) %>%
ggplot(aes(peak_num, gamename, fill = factor(month))) +
geom_col() +
scale_y_reordered() +
facet_wrap(~year, scales = "free_y") +
theme(axis.text.x = element_text(angle = 90),
strip.text = element_text(size = 15),
plot.title = element_text(size = 18)) +
labs(x = "peak player number",
y = NULL,
fill = "month",
title = "Top 5 Games on Peak Numbers")
Games with most gains:
games %>%
filter(gain_from_pre_month != 0) %>%
group_by(date) %>%
slice_max(abs(gain_from_pre_month), n = 1, with_ties = F) %>%
ungroup() %>%
mutate(group = if_else(gain_from_pre_month > 0, "pos", "neg")) %>%
ggplot(aes(date, gain_from_pre_month, color = group)) +
geom_line() +
geom_point() +
geom_text(aes(label = gamename), check_overlap = T,
hjust = 1, vjust = 1) +
theme(legend.position = "none") +
labs(x = NULL,
y = "gain from the previous month",
title = "The Game with the Most Gains per Year and month",
subtitle = "Both positive and negative are included")
Games with the largest average/peak:
games %>%
group_by(date) %>%
slice_max(avg_peak_pct, n = 1) %>%
ungroup() %>%
ggplot(aes(date, avg_peak_pct)) +
geom_point(aes(size = peak_num)) +
geom_line() +
geom_text(aes(label = gamename),
check_overlap = T,
vjust = 1,
hjust = 1,
size = 3) +
scale_y_continuous(labels = percent_format(scale = 1)) +
labs(x = NULL,
y = "avg / peak (%)",
size = "peak number",
title = "The Game with the Largest Average/Peak per Year per month") +
expand_limits(x = as.Date("2010-01-01"))
Dota 2
games %>%
filter(gamename == "Dota 2") %>%
ggplot(aes(date, peak_num)) +
geom_line() +
scale_x_date(date_breaks = "1 year",
date_labels = "%Y") +
labs(x = NULL,
y = "peak # of players",
title = "Dota 2 Peak # of Players")
games %>%
filter(fct_lump(gamename, n = 9, w = avg_player_num) != "Other") %>%
mutate(gamename = fct_reorder(gamename, -avg_player_num, sum)) %>%
ggplot(aes(date, avg_player_num, color = gamename)) +
geom_line(show.legend = F, size = 1) +
facet_wrap(~gamename, scales = "free_y") +
scale_y_continuous(labels = comma) +
labs(x = NULL,
y = "average # of players",
title = "Top 9 Games with the Largest Average # of Players")
Which games got popular during the COVID lockdown?
games %>%
filter(date >= "2018-01-01") %>%
semi_join(
games %>%
filter(date == "2020-03-01") %>%
slice_max(gain_from_pre_month, n = 9),
by = "gamename"
) %>%
mutate(gamename = fct_reorder(gamename, -avg_player_num, max)) %>%
ggplot(aes(date, avg_player_num, color = gamename)) +
geom_line(show.legend = F) +
geom_vline(xintercept = as.Date("2020-03-01"),
color = "red",
lty = 2) +
facet_wrap(~gamename, scales = "free_y") +
labs(x = NULL,
y = "average # of players",
title = "Top 9 Most Gained Games during COVID Lockdown")
games %>%
group_by(date, gain_pos = gain_from_pre_month > 0) %>%
summarize(n = n()) %>%
ungroup() %>%
add_count(date, wt = n, name = "total_game") %>%
filter(gain_pos) %>%
select(-gain_pos) %>%
mutate(pct_gain = n/total_game) %>%
ggplot(aes(date, pct_gain)) +
geom_line() +
scale_y_continuous(labels = percent) +
scale_x_date(date_breaks = "1 year",
date_labels = "%Y") +
labs(x = "",
y = "% of games gained users",
title = "% of Games that Gained Users")
It seems like there is a seasonable pattern among the % of games that gained users.
games %>%
mutate(month = month(date, label = T),
pos_gain = if_else(gain_from_pre_month > 0, TRUE, FALSE)) %>%
group_by(month) %>%
summarize(avg_pos_gain = mean(pos_gain)) %>%
ggplot(aes(month, avg_pos_gain)) +
geom_line(group = 1) +
scale_y_continuous(labels = percent) +
labs(x = NULL,
y = "% of games gained users",
title = "% of Games Gained Users per Month")