French Train Data Processing & Visualization

Wed, Oct 6, 2021 4-minute read

The dataset of this blog post analyzes comes from TidyTuesday about train schedule delays in France across various stations.

Load the packages and the dataset.

library(tidyverse)
library(lubridate)
library(tidytext)
library(scales)
library(ggpmisc)
trains <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-26/full_trains.csv") %>%
  mutate(departure_station = str_to_title(departure_station),
         arrival_station = str_to_title(arrival_station),
         date = make_date(year,month))

trains
## # A tibble: 5,462 x 28
##     year month service  departure_station    arrival_station    journey_time_avg
##    <dbl> <dbl> <chr>    <chr>                <chr>                         <dbl>
##  1  2017     9 National Paris Est            Metz                           85.1
##  2  2017     9 National Reims                Paris Est                      47.1
##  3  2017     9 National Paris Est            Strasbourg                    116. 
##  4  2017     9 National Paris Lyon           Avignon Tgv                   161. 
##  5  2017     9 National Paris Lyon           Bellegarde (Ain)              164. 
##  6  2017     9 National Paris Lyon           Besancon Franche ~            129. 
##  7  2017     9 National Chambery Challes Le~ Paris Lyon                    184. 
##  8  2017     9 National Paris Lyon           Grenoble                      186. 
##  9  2017     9 National Lyon Part Dieu       Paris Lyon                    121. 
## 10  2017     9 National Paris Lyon           Macon Loche                    97.4
## # ... with 5,452 more rows, and 22 more variables: total_num_trips <dbl>,
## #   num_of_canceled_trains <dbl>, comment_cancellations <lgl>,
## #   num_late_at_departure <dbl>, avg_delay_late_at_departure <dbl>,
## #   avg_delay_all_departing <dbl>, comment_delays_at_departure <lgl>,
## #   num_arriving_late <dbl>, avg_delay_late_on_arrival <dbl>,
## #   avg_delay_all_arriving <dbl>, comment_delays_on_arrival <chr>,
## #   delay_cause_external_cause <dbl>, ...
top_6_departure_delays <- trains %>%
  group_by(departure_station) %>%
  summarize(total_delay = sum(num_late_at_departure)) %>%
  arrange(desc(total_delay)) %>%
  select(departure_station) %>%
  head(6) %>%
  pull()

trains %>%
  filter(departure_station %in% top_6_departure_delays) %>%
  group_by(date, departure_station) %>%
  summarize(total_delay = sum(num_late_at_departure)) %>%
  ungroup() %>%
  mutate(departure_station = fct_reorder(departure_station, -total_delay, sum)) %>%
  ggplot(aes(date, total_delay, color = departure_station)) +
  geom_line(size = 1) +
  labs(x = NULL,
       y = "Total # of delays",
       color = "departure station",
       title = "Top 6 Overall Departure Delay Stations")

trains %>%
  filter(departure_station %in% top_6_departure_delays) %>%
  group_by(date, departure_station) %>%
  summarize(avg_delay = mean(avg_delay_late_at_departure)) %>%
  ggplot(aes(date, avg_delay, color = departure_station)) +
  geom_line(size = 1) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_poly_eq(aes(label = paste0("atop(", ..eq.label.., ",", ..rr.label.., ")")), parse = TRUE) +
  facet_wrap(~departure_station) +
  labs(x = NULL,
       y = "average delay (minutes)") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90),
        strip.text = element_text(size = 13, face = "bold"))

trains %>%
  group_by(date, arrival_station) %>%
  summarize(total_delay = sum(num_arriving_late)) %>%
  arrange(desc(total_delay)) %>%
  ungroup() %>% 
  filter(!is.na(total_delay)) %>%
  complete(date, arrival_station, fill = list(n = 0)) %>%
  # mutate(arrival_station = fct_lump(arrival_station, n = 6, w = total_delay),
  #        arrival_station = fct_reorder(arrival_station, -total_delay, sum)) %>%
  #filter(arrival_station != "Other") %>%
  ggplot(aes(date, arrival_station, fill = total_delay)) +
  geom_tile() +
  labs(x = NULL,
       y = "arrival station",
       fill = "# of total delays",
       title = "Arrival Delay Stations on # of Delays (2015 - 2018)",
       subtitle = "Grey blocks indicate missingness due to various reasons") +
  #scale_fill_viridis_c() +
  scale_fill_gradient2(high = "red",
                       low = "blue",
                       mid = "white",
                       midpoint = 500) +
  scale_y_discrete(expand = c(0,0)) +
  scale_x_date(expand = c(0,0)) +
  theme(axis.title = element_text(size = 15),
        axis.text = element_text(size = 13),
        plot.title = element_text(size = 18),
        legend.title = element_text(size = 13),
        legend.text = element_text(size = 11))

delay_cause_with_percentage <- trains %>%
  pivot_longer(cols = contains("delay_cause"), names_to = "delay_cause", values_to = "percentage") %>%
  mutate(delay_cause = str_remove(delay_cause, "delay_cause_"),
         delay_cause = str_replace(delay_cause, "_", " "),
         delay_cause = str_to_title(delay_cause))


num_month_cross <- tibble(month = c(1:12), month_abb = factor(month.abb, levels=month.abb))

delay_cause_with_percentage <- delay_cause_with_percentage %>%
  left_join(num_month_cross, by = "month")

Missing values always mess up with reorder_within(). Be mindful to check missingness if visualization goes awry once reordered.

delay_cause_with_percentage %>% 
  filter(!is.na(percentage)) %>%
  mutate(delay_cause = reorder_within(delay_cause, by = percentage, within = month_abb, fun = median)) %>% 
  ggplot(aes(x = percentage, y = delay_cause, fill = delay_cause)) +
  geom_boxplot(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~month_abb, scales = "free_y") +
  
  theme(
    strip.text = element_text(size = 13, face = "bold"),
    plot.title = element_text(size = 18)
  ) +
  scale_x_continuous(labels = percent) +
  labs(x = NULL,
       y = "delay cause",
       title = "Overall Delay Cause Per Month")

delay_cause_with_percentage %>%
  complete(year, nesting(departure_station, delay_cause), fill = list(percentage = 0)) %>%
  group_by(departure_station, year, delay_cause) %>%
  mutate(avg_delay_perc = mean(percentage, na.rm = T)) %>%
  #mutate(percentage = ifelse(percentage == 0, NA, percentage)) %>%
  ggplot(aes(delay_cause, departure_station, fill = avg_delay_perc)) +
  geom_tile() +
  theme(
    axis.text.x = element_text(angle = 90),
    strip.text = element_text(size = 15, face = "bold")
  ) +
  scale_fill_gradient2(low = "blue",
                       high = "red",
                       mid = "white",
                       midpoint = 0.162,
                       labels = percent) + 
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "delay cause",
       y = NULL,
       fill = "average delay percentage",
       title = "Departure Station Average Delay Cause Percentage") +
  facet_wrap(~year, nrow = 1)