R and R Package Download Analysis

Sun, Sep 5, 2021 6-minute read

R is a nice tool I use everyday for my data analysis work, especially those data science libraries such as tidyverse that have made my work productive. In this blog, we analyze R and R package downloads, and here is the dataset download link.

library(tidyverse)
library(lubridate)
library(scales)
library(countrycode)
theme_set(theme_bw())
r_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-30/r_downloads_year.csv") %>%
  select(-X1) %>%
  mutate(country_name = countrycode(country, "iso2c", "country.name"))

head(r_raw)
## # A tibble: 6 x 8
##   date       time         size version os    country ip_id country_name 
##   <date>     <time>      <dbl> <chr>   <chr> <chr>   <dbl> <chr>        
## 1 2017-10-23 14:29:18 78171332 3.4.2   win   ES          1 Spain        
## 2 2017-10-23 14:29:22 20692638 3.4.2   win   PT          2 Portugal     
## 3 2017-10-23 14:29:57   972075 3.4.2   win   PL          3 Poland       
## 4 2017-10-23 14:30:00  1032203 3.0.3   win   JP          4 Japan        
## 5 2017-10-23 14:30:18 78171332 3.4.2   win   CN          5 China        
## 6 2017-10-23 14:30:50 64228612 3.4.2   osx   US          6 United States
skimr::skim(r_raw)
Table 1: Data summary
Name r_raw
Number of rows 938115
Number of columns 8
_______________________
Column type frequency:
character 4
Date 1
difftime 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
version 0 1.00 5 12 0 81 0
os 2598 1.00 3 3 0 3 0
country 22514 0.98 2 2 0 222 0
country_name 24672 0.97 4 42 0 218 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2017-10-20 2018-10-20 2018-04-29 366

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
time 0 1 0 secs 86399 secs 13:35:58 86387

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
size 0 1 70506726.72 24997546.36 0 77266199 82375219 82968668 83702175 ▁▁▁▁▇
ip_id 0 1 939.49 751.48 1 302 803 1434 3748 ▇▅▃▁▁

Country-wise downloads

r_raw %>%
  mutate(year = year(date),
         ) %>%
  count(year, country_name, sort = TRUE) %>%
  filter(!is.na(country_name)) %>%
  mutate(
    country_name = fct_lump(country_name, n = 15, w = n),
    country_name = fct_reorder(country_name, n)
  ) %>%
  ggplot(aes(n, country_name, fill = country_name)) +
  geom_col(show.legend = FALSE)  +
  facet_wrap(~year) +
  theme(
    strip.text = element_text(size = 13)
  ) +
  scale_x_continuous(labels = comma) +
  labs(x = "# of R downloads", y = NULL, title = "Top 15 Countries R Downloads Faceted by Year")

Operating Systems

r_raw %>%
  count(version, os, sort = T) %>%
  mutate(version = fct_lump(version, n = 15, w = n),
         version = fct_reorder(version, n)) %>%
  filter(!is.na(os)) %>%
  ggplot(aes(os, version, fill = n)) +
  geom_tile() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12)
  ) +
  labs(fill = "# of R downloads", title = "# of R Downloads with Different Versions and OS Platforms") 

r_raw %>%
  count(version, os, sort = T) %>%
  mutate(version = fct_lump(version, n = 15, w = n),
         version = fct_reorder(version, n)) %>%
  filter(!is.na(os)) %>%
  ggplot(aes(n, version, fill = version)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~os) +
  theme(
    strip.text = element_text(size = 13, face = "bold"),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12)
  ) +
  labs(x = "# of R downloads", title = "# of R Downloads with Different Versions and OS Platforms") 

r_raw %>%
  group_by(os, date) %>%
  summarize(downloads = n()) %>%
  ggplot(aes(date, downloads, color = os)) +
  geom_line() +
  geom_point() +
  expand_limits(y = 0) +
  scale_x_date(date_breaks = "2 months") +
  geom_smooth(method = "lm") +
  ggtitle("Time-series Total # of R Downloads")
## `summarise()` regrouping output by 'os' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

r_raw %>%
  group_by(os, date) %>%
  summarize(downloads = n()) %>%
  ggplot(aes(date, downloads, color = factor(year(date)))) +
  geom_line() +
  geom_point() +
  expand_limits(y = 0) +
  scale_x_date(date_breaks = "2 months") +
  geom_smooth(method = "lm") +
  labs(color = NULL) +
  ggtitle("Time-series Total # of R Downloads")
## `summarise()` regrouping output by 'os' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

Package Size

r_raw %>%
  mutate(mb = size/10^6,
         version = fct_lump(version, 5)) %>% 
  ggplot(aes(os, version, fill = mb)) +
  geom_tile() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 13)
  ) +
  labs(fill = "size (MB)")

Weekday

r_raw %>%
  mutate(weekday = factor(weekdays(date), levels = c("Monday", 
    "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")),
         hour = hour(time)) %>%
  group_by(weekday, hour) %>%
  summarize(downloads = n_distinct(ip_id)) %>%
  ggplot(aes(hour, downloads, color = weekday, linetype = weekday)) +
  geom_line(size = 1) +
  labs(color = NULL, linetype = NULL, y = "downloads (distinct IPs)")
## `summarise()` regrouping output by 'weekday' (override with `.groups` argument)

The following analysis is inspired by and adopted in part from David Robinson’s code, as when I was working on my analysis, I did not pay attention on ip_id column, which has some shocking number of duplicates.

r_raw %>%
  count(ip_id, sort = T)
## # A tibble: 3,748 x 2
##    ip_id     n
##    <dbl> <int>
##  1     1  7093
##  2     7  5385
##  3     2  5158
##  4     5  4477
##  5     6  3756
##  6    10  3623
##  7     4  3341
##  8    12  2369
##  9    18  2295
## 10     3  2219
## # ... with 3,738 more rows
r_raw %>%
  ggplot(aes(ip_id)) +
  geom_histogram()

In his code, Robinson Used as.POSIXlt() to put date and time together assembling timestamp column datetime, but I found out that using ymd(date) + hms(time) can do the same job, and in some senese, is easier. Then using window function lag() trying to obtain the gaps within each ip_id.

r_raw %>%
  #mutate(datetime = as.POSIXlt(date) + time) %>%
  mutate(datetime = ymd(date) + hms(time)) %>%
  arrange(datetime) %>%
  group_by(ip_id) %>%
  mutate(gap = as.numeric(datetime - lag(datetime))) %>%
  filter(!is.na(gap)) %>%
  ggplot(aes(gap)) +
  geom_histogram() +
  geom_vline(color = "red", lty = 2, xintercept = 86400) +
  scale_x_log10(breaks = 60 ^ (0:4),
                labels = c("Second", "Minute", "Hour", "2.5 Days", "120 Days")) +
  scale_y_continuous(labels = comma)

Note: First time using n_distinct() and floor_date().

r_raw %>%
  group_by(week = floor_date(date, "week", week_start = 1)) %>%
  summarize(n = n_distinct(ip_id)) %>%
  filter(week > min(week)) %>%
  ggplot(aes(week, n)) +
  geom_line() +
  expand_limits(y = 0) +
  labs(y = "# of R downloads per week (distinct IPs)")

Package Downloads

package_downloads <- read_csv("http://cran-logs.rstudio.com/2018/2018-10-27.csv.gz")

head(package_downloads)
## # A tibble: 6 x 10
##   date       time        size r_version r_arch r_os     package  version country
##   <date>     <time>     <dbl> <chr>     <chr>  <chr>    <chr>    <chr>   <chr>  
## 1 2018-10-27 22:43:31 2475672 3.5.1     x86_64 darwin1~ openssl  1.0.2   NA     
## 2 2018-10-27 22:43:32  184625 3.5.1     x86_64 darwin1~ callr    3.0.0   NA     
## 3 2018-10-27 22:43:31  266636 3.5.0     x86_64 linux-g~ fansi    0.4.0   US     
## 4 2018-10-27 22:43:32  367563 3.5.0     x86_64 linux-g~ curl     3.2     US     
## 5 2018-10-27 22:43:30   45922 3.4.1     x86_64 linux-g~ htmltoo~ 0.3.6   US     
## 6 2018-10-27 22:43:30   57664 3.5.1     x86_64 darwin1~ viridis~ 0.3.0   NA     
## # ... with 1 more variable: ip_id <dbl>
package_downloads %>%
  filter(country == "US") %>%
  group_by(package) %>%
  summarize(n = n_distinct(ip_id)) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  mutate(package = fct_reorder(package, n)) %>%
  ggplot(aes(n, package, fill = package)) +
  geom_col(show.legend = F) +
  labs(x = "# of downloads")