Visualization on Himalayan Climbing Expeditions
Tue, Feb 15, 2022
4-minute read
This blog post anlayzes Himalayan Climbing data coming from TidyTuesday.
library(tidyverse)
library(tidytext)
library(widyr)
library(ggraph)
theme_set(theme_light())
members <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
expeditions <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/expeditions.csv') %>%
mutate(termination_reason = str_to_title(str_remove_all(termination_reason, "\\s\\(.*\\)|or\\s"))) %>%
separate_rows(termination_reason, sep = ", ") %>%
mutate(success_or_not = if_else(termination_reason == "Success", "success", "not success"))
peaks <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/peaks.csv') %>%
separate_rows(first_ascent_country, sep = ",\\s")
Working on members
:
members %>%
distinct(expedition_id, year) %>%
count(year, name = "num_of_climbs") %>%
left_join(
members %>%
group_by(expedition_id, year) %>%
summarize(n = n()) %>%
ungroup() %>%
count(year, wt = n, name = "num_of_climbers"),
by = "year") %>%
ggplot(aes(year, num_of_climbs)) +
geom_line() +
geom_point(aes(size = num_of_climbers)) +
scale_x_continuous(breaks = seq(1900, 2020, 10)) +
scale_size_continuous(range = c(1, 6), breaks = c(100, 200, 500, 1000, 2000)) +
labs(x = NULL,
y = "# of climbs",
size = "# of yearly climbers",
title = "Yearly # of Climbs and # of Climbers in Total") +
theme(plot.title = element_text(size = 18))
Both # of climbs and # of climbers have increased dramatically in recent years.
Seasons:
members %>%
filter(season != "Unknown") %>%
distinct(expedition_id, year, season) %>%
count(year, season, sort = T) %>%
mutate(season = fct_reorder(season, -n, sum)) %>%
ggplot(aes(year, n, color = season)) +
geom_line(size = 1) +
labs(x = NULL,
y = "# of expeditions",
title = "Yearly # of Expeditions per Season") +
theme(plot.title = element_text(size = 18))
Both fall and spring have attracted much more climbers than other two seasons.
members %>%
filter(fct_lump(citizenship, n = 10) != "Other",
!is.na(sex),
!is.na(age)) %>%
mutate(sex = fct_recode(sex,
"Female" = "F",
"Male" = "M")) %>%
mutate(citizenship = reorder_within(citizenship, age, sex, median)) %>%
ggplot(aes(age, citizenship, fill = citizenship)) +
geom_boxplot(show.legend = F) +
scale_y_reordered() +
facet_wrap(~sex, scales = "free_y") +
labs(title = "Age Distributions among All Climbers from Top 10 Countries",
subtitle = "Top countries are defined by # of climbers") +
theme(strip.text = element_text(size = 15),
plot.title = element_text(size = 18))
members %>%
filter(highpoint_metres > 0) %>%
distinct(year, highpoint_metres) %>%
group_by(year) %>%
summarize(highpoint = max(highpoint_metres, na.rm = T)) %>%
ggplot(aes(year, highpoint)) +
geom_line() +
geom_point() +
expand_limits(y = 0) +
labs(y = "highpoint (meters)",
title = "The Annual Highest Point Achieved")
Graph on countries:
set.seed(2022)
members %>%
count(citizenship, peak_name, sort = T) %>%
filter(n > 1000) %>%
pairwise_cor(citizenship, peak_name, n, sort = T) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(width = correlation, color = correlation), alpha = 0.5) +
geom_node_point() +
geom_node_text(aes(label = name), hjust = 1, vjust = -1, check_overlap = T, size = 5) +
theme_void() +
guides(color = "none", edge_color = "none") +
labs(edge_color = "correlation",
title = "How are countries correlated?") +
theme(plot.title = element_text(size = 18))
Death:
members %>%
filter(died,
fct_lump(expedition_role, n = 3) != "Other") %>%
count(death_cause, season, expedition_role, sort = T) %>%
mutate(death_cause = reorder_within(death_cause, n, expedition_role)) %>%
ggplot(aes(n, death_cause, fill = season)) +
geom_col() +
facet_wrap(~expedition_role, scales = "free_y") +
scale_y_reordered() +
labs(x = "# of deaths",
y = "death cause",
title = "The Reasons Why Climbers were Killed")
Working on expeditions
:
What are the termination reasons?
expeditions %>%
filter(fct_lump(termination_reason, n = 16) != "Other") %>%
count(year, termination_reason, sort = T) %>%
mutate(termination_reason = fct_reorder(termination_reason, -n, sum)) %>%
ggplot(aes(year, n, color = termination_reason)) +
geom_line() +
geom_point() +
facet_wrap(~termination_reason, ncol = 5) +
theme(legend.position = "none") +
labs(y = "# of expeditions terminated in this reason",
title = "How did Expeditions be Terminated?")
of members and success:
expeditions %>%
ggplot(aes(members, success_or_not, fill = success_or_not)) +
geom_boxplot(show.legend = F) +
scale_x_log10() +
labs(x = "# of members",
y = "expediton result",
title = "More Team Members, Higher Chances for Success?")
It looks like successful teams had more median members.
expeditions %>%
filter(season %in% c("Autumn", "Spring")) %>%
mutate(date = difftime(highpoint_date, basecamp_date, units = "days")) %>%
ggplot(aes(date, fill = success_or_not)) +
geom_histogram(alpha = 0.8) +
facet_wrap(~season) +
labs(x = "days",
fill = NULL,
title = "How Far Apart between Basecamp Date and Highpoint Date?")
Working on peaks
:
peaks %>%
arrange(desc(height_metres),
first_ascent_year) %>%
head(11) %>%
mutate(first_ascent_country = fct_reorder(first_ascent_country, height_metres),
peak_name = fct_reorder(peak_name, -height_metres)) %>%
ggplot(aes(first_ascent_country, height_metres, fill = peak_name)) +
geom_col(position = "dodge") +
geom_text(aes(label = first_ascent_year), hjust = 1, check_overlap = T) +
coord_flip() +
labs(y = "height (meters)",
x = "first ascent country",
fill = "peak name",
title = "Top 10 Peaks being Climbed")