Space Launch Data Visualization
Wed, Sep 22, 2021
5-minute read
The datasets for this blog post come from TidyTuesday about space launches from various countries in different time intervals. There are two datasets, one is for each space launch detail, and the other one is about space launch agencies worldwide.
library(tidyverse)
library(scales)
library(lubridate)
library(tidytext)
library(patchwork)
theme_set(theme_light())
agencies <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv")
launches <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv") %>%
mutate(category = if_else(category == "O", "success", "failure"))
Data quality check
There is a typo in the dataset from the column launch_date
.
launches %>%
mutate(year = year(launch_date)) %>%
filter(year == 2918)
## # A tibble: 1 x 12
## tag JD launch_date launch_year type variant mission agency state_code
## <chr> <dbl> <date> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2018-F01 2787121. 2918-10-11 2018 Soyu~ <NA> Soyuz ~ RU RU
## # ... with 3 more variables: category <chr>, agency_type <chr>, year <dbl>
launch_date
is 2918-10-11
but launch_year
is 2018. There is a false data entry here.
Time series data visualization
state_text <- launches %>%
group_by(launch_year, state_code) %>%
summarize(n = n()) %>%
filter(n > 10) %>%
ungroup() %>%
group_by(state_code) %>%
slice_max(n, n = 1) %>%
ungroup() %>%
distinct(state_code, n, .keep_all = TRUE)
p1 <- launches %>%
count(launch_year, state_code, sort = T) %>%
filter(n > 10) %>%
mutate(state_code = fct_reorder(state_code, n)) %>%
ggplot(aes(launch_year, n, color = state_code)) +
geom_line(size = 1) +
geom_text(data = state_text, aes(label = state_code), check_overlap = T,
nudge_x = 1, nudge_y = 1,
vjust = 1,
size = 3) +
theme(
legend.position = "none"
) +
labs(x = "launch year",
y = "# of launches",
title = "Yearly # of Rocket Launches > than 10") +
expand_limits(y = 0) +
scale_x_continuous(breaks = seq(1960, 2020, by = 5))
p2 <- launches %>%
count(launch_year, agency_type, sort = T) %>%
ggplot(aes(launch_year, n, color = agency_type)) +
geom_line(size = 1) +
labs(x = "launch year",
y = "# of launches",
title = "# of Total Launches") +
scale_x_continuous(breaks = seq(1960, 2020, by = 5)) +
scale_y_continuous(breaks = seq(0, 150, 10))
p1 / p2
state_yearly_launches <- launches %>%
group_by(launch_year, state_code) %>%
summarize(n = n()) %>%
left_join(launches %>%
count(launch_year, sort = T, name = "total_launches"), by = "launch_year") %>%
mutate(state_ratio = n/total_launches) %>%
ungroup()
state_yearly_launches %>%
filter(state_code %in% state_text$state_code) %>%
mutate(state_code = fct_reorder(state_code, -state_ratio, sum)) %>%
ggplot(aes(launch_year, state_ratio, color = state_code)) +
geom_line(size = 1) +
scale_x_continuous(breaks = seq(1960, 2020, by = 5)) +
scale_y_continuous(labels = percent) +
labs(x = "launch year",
y = "yearly state percentage of rocket launches",
title = "Yearly State Rocket Launch Percentage")
Top types of rockets launched
launches %>%
count(type, state_code, sort = T) %>%
mutate(type = fct_lump(type, n = 20, w = n),
type = fct_reorder(type, n)) %>%
filter(type != "Other") %>%
ggplot(aes(n, type, fill = state_code, label = state_code, group = state_code)) +
geom_col() +
geom_text(check_overlap = T, position = position_stack(vjust = 0.5)) +
theme(
legend.position = "none",
axis.title = element_text(size = 13),
axis.text = element_text(size = 12),
plot.title = element_text(size = 17)
) +
labs(x = "# of launches",
y = NULL,
title = "Top 20 Types of Rockets Launched")
Launch Success or Failure
launches %>%
count(category, state_code, sort = T) %>%
group_by(state_code) %>%
mutate(total_state_launch = sum(n)) %>%
ungroup() %>%
mutate(success_ratio = n/total_state_launch) %>%
filter(category == "success") %>%
select(-category) %>%
mutate(state_code = fct_reorder(state_code, success_ratio)) %>%
ggplot(aes(success_ratio, state_code, fill = state_code)) +
geom_col() +
geom_text(aes(label = paste0(round(success_ratio * 100, 2), "%"),
color = state_code), hjust = -0.01) +
scale_x_continuous(labels = percent, breaks = seq(0,1, 0.25)) +
labs(x = "success launch ratio",
y = "state code",
title = "State-wise Success Launch Ratio") +
theme(
legend.position = "none"
)
State-wise agency type
agencies %>%
group_by(state_code, agency_type) %>%
summarize(count = n()) %>%
ungroup() %>%
group_by(state_code) %>%
mutate(total_state_count = sum(count),
agency_type = factor(agency_type, c("state", "private", "startup"))) %>%
ungroup() %>%
mutate(type_ratio = count/total_state_count,
state_code = reorder_within(state_code, type_ratio, agency_type)) %>%
ggplot(aes(type_ratio, state_code, fill = state_code)) +
geom_col() +
facet_wrap(~agency_type, scales = "free_y") +
scale_x_continuous(labels = percent) +
scale_y_reordered() +
theme(
legend.position = "none",
strip.text = element_text(size = 15, face = "bold")
) +
labs(x = "percentage of state-wise agency type",
y = "state code",
title = "State-wise Agency Type Percentage",
subtitle = "Percentage is computed by grouping by each cournty")
Learning resources
The following code/idea is referenced/inspired by David Robinson (here is his code).
Now focus on private + startup launches
private_startup_launches <- launches %>%
filter(agency_type %in% c("startup", "private")) %>%
inner_join(agencies %>%
select(agency, agency_name = name, short_name), by = "agency") %>%
mutate(agency_name_lumped = fct_lump(agency_name, 6),
agency_name_lumped = if_else(agency_name_lumped == "Other" & state_code == "US",
"Other US", as.character(agency_name_lumped)))
private_startup_launches %>%
count(agency_name_lumped, state_code, sort = TRUE) %>%
mutate(agency_name_lumped = fct_reorder(agency_name_lumped, n, sum)) %>%
ggplot(aes(agency_name_lumped, n, fill = state_code)) +
geom_col() +
coord_flip() +
labs(x = "",
y = "# of launches overall",
title = "What private/startup agencies have had the most launches?",
fill = "Country")
Launch per vehicle type
vehicles <- launches %>%
filter(year(launch_date) < 2020) %>%
group_by(type, state_code) %>%
summarize(first_launch = min(launch_date),
last_launch = max(launch_date),
num_of_launches = n()) %>%
ungroup()
## `summarise()` has grouped output by 'type'. You can override using the `.groups` argument.
The functions getting vehicle types for a specific country in different time intervals
country_vehicles <- function(country1, country2 = "", mid_point = 300, country_name_for_title, num_of_filter = 0){
vehicles %>%
filter(state_code == country1 | state_code == country2,
year(first_launch) < year(last_launch)) %>%
mutate(type = fct_reorder(type, first_launch, min)) %>%
filter(num_of_launches > num_of_filter) %>%
ggplot(aes(first_launch, type, color = num_of_launches, size = num_of_launches)) +
geom_point() +
geom_errorbar(aes(xmin = first_launch, xmax = last_launch), size = 1) +
scale_x_date(date_breaks = "5 years", date_labels = "%Y") +
scale_color_gradient2(low = "blue",
high = "red",
midpoint = mid_point) +
labs(x = "launching time interval",
y = "vehicle type",
title = paste(country_name_for_title, "Rocket Types in Different Time Intervals"),
color = "# of launches",
size = "# of launches")
}
Russian Vehicles
country_vehicles(country1 = "SU",country2 = "RU", country_name_for_title = "Soviet Union/Russian")
US Vehicles
country_vehicles(country1 = "US", country_name_for_title = "US", mid_point = 50, num_of_filter = 10)
Chinese Vehicles
country_vehicles(country1 = "CN", country_name_for_title = "China", mid_point = 20)