Data Visualization on Transit Costs

Fri, Mar 4, 2022 4-minute read

In this blog post, I will analyze the cost associated with the transit projects each country spends on. The data comes from TidyTuesday.

library(tidyverse)
library(countrycode)
library(scales)
library(patchwork)
theme_set(theme_light())

Read the data and process it by renaming some columns and adding new ones.

transit_cost <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv') %>%
  distinct() %>%
  filter(length < 2500) %>%
  rename(id = e,
         railway = rr,
         length_completed = tunnel,
         cost_usd = real_cost,
         cost_km_usd = cost_km_millions) %>%
  mutate(across(5:6, as.numeric),
         project_duration = end_year - start_year + 1,
         cost_usd = as.numeric(cost_usd),
         country = fct_recode(country, "GB" = "UK"),
         country_name = countrycode(country, origin = "iso2c", destination = "country.name"))

transit_cost
## # A tibble: 535 x 22
##       id country city        line  start_year end_year railway length tunnel_per
##    <dbl> <fct>   <chr>       <chr>      <dbl>    <dbl>   <dbl>  <dbl> <chr>     
##  1  7136 CA      Vancouver   Broa~       2020     2025       0    5.7 87.72%    
##  2  7137 CA      Toronto     Vaug~       2009     2017       0    8.6 100.00%   
##  3  7138 CA      Toronto     Scar~       2020     2030       0    7.8 100.00%   
##  4  7139 CA      Toronto     Onta~       2020     2030       0   15.5 57.00%    
##  5  7144 CA      Toronto     Yong~       2020     2030       0    7.4 100.00%   
##  6  7145 NL      Amsterdam   Nort~       2003     2018       0    9.7 73.00%    
##  7  7146 CA      Montreal    Blue~       2020     2026       0    5.8 100.00%   
##  8  7147 US      Seattle     U-Li~       2009     2016       0    5.1 100.00%   
##  9  7152 US      Los Angeles Purp~       2020     2027       0    4.2 100.00%   
## 10  7153 US      Los Angeles Purp~       2018     2026       0    4.2 100.00%   
## # ... with 525 more rows, and 13 more variables: length_completed <dbl>,
## #   stations <dbl>, source1 <chr>, cost <dbl>, currency <chr>, year <dbl>,
## #   ppp_rate <dbl>, cost_usd <dbl>, cost_km_usd <dbl>, source2 <chr>,
## #   reference <chr>, project_duration <dbl>, country_name <chr>

How long does each country spend on the transit projects?

transit_cost %>%
  filter(fct_lump(country_name, n = 10) != "Other") %>%
  mutate(country_name = fct_reorder(country_name, project_duration, na.rm = T)) %>%
  ggplot(aes(project_duration, country_name)) +
  geom_boxplot(aes(fill = country_name)) +
  theme(legend.position = "none") +
  labs(x = "project duration (years)",
       y = "",
       title = "Top 10 Entities/Regions on Transit Project Duration")

of Projects each country does annually:

transit_cost %>%
  filter(fct_lump(country_name, n = 9) != "Other",
         !is.na(start_year)) %>%
  add_count(country_name, start_year, name = "num_projects") %>% 
  group_by(start_year, country_name) %>%
  mutate(avg_cost_usd = mean(cost_usd, na.rm = T)) %>%
  ungroup() %>% 
  ggplot(aes(start_year, num_projects, color = country_name)) +
  geom_line() +
  geom_point(aes(size = avg_cost_usd)) +
  facet_wrap(~country_name) +
  scale_color_discrete(guide = "none") +
  scale_size_continuous(label = dollar) +
  labs(x = "start year",
       y = "# of projects",
       size = "average annual cost (USD)",
       title = "9 Countries and Their Annual Transit Project",
       subtitle = "Annual costs are in millions")

The relationship between transit length and # of years to finish:

transit_cost %>%
  ggplot(aes(length, project_duration, color = country)) +
  geom_point(alpha = 0.7, aes(size = cost_km_usd)) +
  geom_smooth(method = "lm", aes(group = 1), se = F) +
  geom_text(aes(label = country), check_overlap = T, vjust = 1, hjust = 1) +
  scale_color_discrete(guide = "none") +
  scale_x_log10() +
  scale_size_continuous(label = dollar) +
  labs(x = "length (km)",
       y = "# of years to finish",
       size = "cost per km (USD in millions)") 

The length of the project does not necessarily indicate how long it takes to finish.

p1 <- transit_cost %>%
  group_by(start_year) %>%
  slice_max(cost_usd, n = 1) %>%
  ungroup() %>%
  ggplot(aes(start_year, cost_usd, fill = country_name)) +
  geom_col(show.legend = F) +
  geom_text(aes(label = country_name), check_overlap = T, vjust = 0, hjust = 0.5, size = 3) +
  scale_x_continuous(breaks = seq(1980, 2025, 5)) +
  scale_y_continuous(labels = dollar) +
  labs(x = "start year",
       y = "total cost (USD in millions)",
       title = "The Most Expensive Project Per Year",
       subtitle = "Being evaluated by the total cost")


p2 <- transit_cost %>%
  group_by(start_year) %>%
  slice_max(cost_km_usd, n = 1) %>%
  ungroup() %>%
  ggplot(aes(start_year, cost_km_usd, fill = country_name)) +
  geom_col(show.legend = F) +
  geom_text(aes(label = country_name), check_overlap = T, vjust = 0, hjust = 0.5, size = 3) +
  scale_x_continuous(breaks = seq(1980, 2025, 5)) +
  scale_y_continuous(labels = dollar) +
  labs(x = "start year",
       y = "average cost (USD in millions)",
       title = "The Most Expensive Project Per Year",
       subtitle = "Being evaluated by the average cost")

p1/p2

transit_cost %>%
  group_by(country, start_year) %>%
  summarize(avg_cost = mean(cost_km_usd, na.rm = T)) %>%
  ungroup() %>%
  arrange(desc(avg_cost)) %>%
  group_by(start_year) %>%
  slice_max(avg_cost, n = 1) %>%
  ungroup() %>%
  filter(!is.na(start_year)) %>%
  mutate(country_year = paste0(country, "(", start_year, ")"),
         country_year = fct_reorder(country_year, parse_number(country_year))) %>%
  ggplot(aes(avg_cost, country_year, fill = avg_cost)) +
  geom_col(show.legend = F) +
  labs(x = "average cost per km (USD in millions)",
       y = "country and year",
       title = "The Most Annual Expensive Transit Project",
       subtitle = "The cost is evaluated by USD in millions per km")

How much can each country build in a year (km)?

transit_cost %>%
  filter(tunnel_per == "100.00%") %>%
  mutate(efficiency = length/project_duration) %>%
  group_by(country_name) %>%
  summarize(avg_km = mean(efficiency, na.rm = T),
            n = n()) %>%
  filter(n > 5) %>%
  ungroup() %>%
  mutate(country_name = fct_reorder(country_name, avg_km)) %>%
  ggplot(aes(avg_km, country_name)) +
  geom_col() +
  labs(x = "average KM being constructed per year",
       y = "",
       title = "The Efficiency for Building Transit System per Country") 

China is the most efficient country and it can build more than 4 KMs per year on average.