Caribou Tracking Location Data Visualization

Sat, Jan 15, 2022 5-minute read

In this blog post, we will analyze caribou tracking datasets from TidyTuesday.

library(tidyverse)
library(scales)
library(lubridate)
library(glue)
theme_set(theme_bw())

individuals <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-23/individuals.csv')

locations <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-23/locations.csv')

Check the missing values for `individuals`

individuals %>%
  map_dfr(~ mean(is.na(.)) * 100) %>%
  pivot_longer(everything(), names_to = "var", values_to = "missing_percentage(%)")

## # A tibble: 14 x 2
##    var                  `missing_percentage(%)`
##    <chr>                                  <dbl>
##  1 animal_id                                0  
##  2 sex                                      0  
##  3 life_stage                              76.6
##  4 pregnant                                93.4
##  5 with_calf                               70.6
##  6 death_cause                             81.1
##  7 study_site                               0  
##  8 deploy_on_longitude                     46.5
##  9 deploy_on_latitude                      46.5
## 10 deploy_on_comments                      69.6
## 11 deploy_off_longitude                    80.4
## 12 deploy_off_latitude                     80.4
## 13 deploy_off_type                          0  
## 14 deploy_off_comments                     80.4

How's deployment type of each study site?

Based on the data introduction in the link, deployment refers to the animal was put on a location-tracking tag.

individuals %>%
  group_by(study_site, deploy_off_type) %>%
  summarize(n = n())

## # A tibble: 23 x 3
## # Groups:   study_site [8]
##    study_site  deploy_off_type     n
##    <chr>       <chr>           <int>
##  1 Burnt Pine  dead                3
##  2 Burnt Pine  other               2
##  3 Burnt Pine  removal             4
##  4 Graham      unknown            55
##  5 Hart Ranges dead                4
##  6 Hart Ranges other               3
##  7 Hart Ranges removal             4
##  8 Hart Ranges unknown            41
##  9 Kennedy     dead               15
## 10 Kennedy     other               5
## # ... with 13 more rows

by_study_site <- individuals %>%
  count(study_site, deploy_off_type, sort = T) %>%
  group_by(study_site) %>%
  mutate(percent_type_per_site = n/sum(n)) %>%
  ungroup()
  
by_study_site %>%
  ggplot(aes(percent_type_per_site, study_site, fill =deploy_off_type)) +
  geom_col() +
  scale_x_continuous(labels = percent) +
  labs(x = "percentage of type per site",
       y = "study site",
       fill = "deploy off type",
       title = "The site-wise percentage of deployment type",
       subtitle = '"Deployment" refers to when the animal was fitted with a location-tracking tag.')

locations %>%
  mutate(year = year(timestamp)) %>%
  count(year, season, sort = T) %>%
  ggplot(aes(year, n, color = season)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(1990, 2020, 5)) +
  labs(x = NULL,
       y = "# of events",
       color = NULL,
       title = glue("# of measurement events from {min(year(locations$timestamp))} to {max(year(locations$timestamp))}"))

Where are the study sites located?

locations %>%
  ggplot(aes(longitude, latitude, color = study_site)) +
  geom_point() +
  labs(color = "study site",
       title = "Where are the study sites?")

Trying to obtain the maximum and minimum coordinate values of Canada before drawing a map.

canada_coords <- map_data("world") %>%
  filter(region == "Canada") %>%
  summarize(max_long = max(long), min_long = min(long),
            max_lat = max(lat), min_lat = min(lat)) 

canada_coords

##    max_long  min_long  max_lat  min_lat
## 1 -52.65366 -141.0022 83.11611 41.67485

Adding borders:

locations %>%
  ggplot(aes(longitude, latitude, color = study_site)) +
  geom_point() +
  borders("world", 
          xlim = c(canada_coords$min_long, canada_coords$max_long),
          ylim = c(canada_coords$min_lat, canada_coords$max_lat)) +
  labs(color = "study site",
       title = "Where are the caribou locations?")

We can zoom into the map a bit, because the previous map is difficult to see exactly where the study sites are located.

locations %>%
  ggplot(aes(longitude, latitude, color = study_site)) +
  geom_point() +
  borders("world", 
          xlim = c(-110, -105),
          ylim = c(55, 60)) +
  theme_void() +
  labs(color = "study site",
       title = "Where are the caribou locations?")

locations %>%
  group_by(animal_id) %>%
  summarize(max_duration = difftime(max(timestamp), min(timestamp), units = "days"),
            max_duration = as.numeric(max_duration))

## # A tibble: 260 x 2
##    animal_id max_duration
##    <chr>            <dbl>
##  1 BP_car022        1822.
##  2 BP_car023         151.
##  3 BP_car032         492.
##  4 BP_car043         548.
##  5 BP_car100         613.
##  6 BP_car101         534.
##  7 BP_car115         587.
##  8 BP_car144         150.
##  9 BP_car145         959.
## 10 GR_C01            762.
## # ... with 250 more rows

The path of a random caribou

This is my first time using geom_path() to visualize data. Let's see how it goes. This little section is inspired by David Robinson's code.

set.seed(2022)
locations %>%
  filter(animal_id == sample(animal_id, 2)) %>%
  arrange(timestamp) %>%
  ggplot(aes(longitude, latitude, color = animal_id)) +
  geom_path() +
  geom_point() +
  labs(title = "Two random caribous' tracking records")

One thing needs to remember is that geom_path() does not make sense if the data is not rearranged by timestamp for this very visualization.

Join two tibbles together

joined <- individuals %>%
  select(animal_id, study_site, deploy_off_type) %>%
  right_join(
    locations, by = c("animal_id", "study_site")
  ) %>%
  group_by(animal_id) %>%
  mutate(max_duration = difftime(max(timestamp), min(timestamp), units = "days"),
            max_duration = as.numeric(max_duration)) %>% 
  ungroup()

Maximum days of tracking

joined %>%
  distinct(animal_id, study_site, deploy_off_type, max_duration) %>%
  ggplot(aes(max_duration, deploy_off_type, fill = deploy_off_type)) +
  geom_violin(show.legend = F) +
  facet_wrap(~study_site) +
  theme(strip.text = element_text(size = 13),
        axis.text = element_text(size = 11),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 15)) +
  labs(x = "maximum days of tracking locations",
       y = "deploy off type",
       title = "Maximum days of tracking with respect to deploy off type")

Checking if the initial and final tracking locations for each animal_id are different:

joined %>%
  group_by(animal_id) %>%
  mutate(ts_min = min(timestamp),
         ts_max = max(timestamp)) %>% 
  ungroup() %>%
  filter(timestamp == ts_min | timestamp == ts_max) %>% 
  group_by(animal_id) %>%
  summarize(n = n_distinct(study_site)) %>%
  arrange(desc(n))

## # A tibble: 260 x 2
##    animal_id     n
##    <chr>     <int>
##  1 BP_car022     1
##  2 BP_car023     1
##  3 BP_car032     1
##  4 BP_car043     1
##  5 BP_car100     1
##  6 BP_car101     1
##  7 BP_car115     1
##  8 BP_car144     1
##  9 BP_car145     1
## 10 GR_C01        1
## # ... with 250 more rows

It turns out they are the same, as n = 1.

joined %>%
  group_by(animal_id) %>%
  summarize(n = n_distinct(study_site)) %>%
  arrange(desc(n))

## # A tibble: 260 x 2
##    animal_id     n
##    <chr>     <int>
##  1 BP_car022     1
##  2 BP_car023     1
##  3 BP_car032     1
##  4 BP_car043     1
##  5 BP_car100     1
##  6 BP_car101     1
##  7 BP_car115     1
##  8 BP_car144     1
##  9 BP_car145     1
## 10 GR_C01        1
## # ... with 250 more rows

These caribous have never crossed any of the study sites, as the largest n is 1.

The number of trackings

joined %>%
  mutate(year = year(timestamp),
         month = month(timestamp)) %>%
  mutate(year_month = ym(paste0(year,"-",month))) %>% 
  count(year_month, season, sort = T) %>%
  ggplot(aes(year_month, n, fill = season)) +
  geom_col() +
  scale_x_date(date_breaks = "3 years", date_labels = "%Y") +
  labs(x = "year",
       y = "# of trackings",
       title = glue("# of tracking from {min(year(joined$timestamp))} to {max(year(joined$timestamp))}"))