R and R Package Download Analysis

Sun, Sep 5, 2021 6-minute read

R is a nice tool that I use every day for my data analysis work, especially data science libraries such as tidyverse, which have made my work more productive. In this blog post, we analyze R and R package downloads; here is the dataset download link.

library(tidyverse)
library(lubridate)
library(scales)
library(countrycode)
theme_set(theme_bw())

# Load the TidyTuesday R-download log. The CSV's unnamed leading column is
# auto-named `X1` by readr < 2.0 and `...1` by readr >= 2.0, so drop whichever
# of the two is present instead of hard-coding one name.
r_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-30/r_downloads_year.csv") %>%
  select(-any_of(c("X1", "...1"))) %>%
  # Translate two-letter ISO country codes into English country names.
  mutate(country_name = countrycode(country, "iso2c", "country.name"))

head(r_raw)

## # A tibble: 6 x 8
##   date       time         size version os    country ip_id country_name 
##   <date>     <time>      <dbl> <chr>   <chr> <chr>   <dbl> <chr>        
## 1 2017-10-23 14:29:18 78171332 3.4.2   win   ES          1 Spain        
## 2 2017-10-23 14:29:22 20692638 3.4.2   win   PT          2 Portugal     
## 3 2017-10-23 14:29:57   972075 3.4.2   win   PL          3 Poland       
## 4 2017-10-23 14:30:00  1032203 3.0.3   win   JP          4 Japan        
## 5 2017-10-23 14:30:18 78171332 3.4.2   win   CN          5 China        
## 6 2017-10-23 14:30:50 64228612 3.4.2   osx   US          6 United States

# Per-column overview of r_raw (types, missing values, summary statistics).
skimr::skim(r_raw)

Table 1: Data summary
Name	r_raw
Number of rows	938115
Number of columns	8
_______________________
Column type frequency:
character	4
Date	1
difftime	1
numeric	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
version	0	1.00	5	12	81
os	2598	1.00	3	3	3
country	22514	0.98	2	2	222
country_name	24672	0.97	4	42	218

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	2017-10-20	2018-10-20	2018-04-29	366

Variable type: difftime

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time	0	1	0 secs	86399 secs	13:35:58	86387

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
size	0	1	70506726.72	24997546.36	0	77266199	82375219	82968668	83702175	▁▁▁▁▇
ip_id	0	1	939.49	751.48	1	302	803	1434	3748	▇▅▃▁▁

Country-wise downloads

# Downloads per country and calendar year for the 15 most frequent countries;
# every other country is pooled into a single "Other" bar.
r_raw %>%
  filter(!is.na(country_name)) %>%
  mutate(
    year = year(date),
    # Lump on the raw rows (before counting) so "Other" becomes one row per
    # year instead of one row per lumped country.
    country_name = fct_lump(country_name, n = 15)
  ) %>%
  count(year, country_name) %>%
  # Order by total downloads so the axis order matches the bar lengths; the
  # default summary (median of n) does not.
  mutate(country_name = fct_reorder(country_name, n, .fun = sum)) %>%
  ggplot(aes(n, country_name, fill = country_name)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~year) +
  theme(
    strip.text = element_text(size = 13)
  ) +
  scale_x_continuous(labels = comma) +
  labs(
    x = "# of R downloads",
    y = NULL,
    title = "Top 15 Countries R Downloads Faceted by Year"
  )

Operating Systems

# Heat map of download counts by R version (15 most frequent, rest pooled
# into "Other") and operating system.
r_raw %>%
  # Lump before counting so each (version, os) cell is a single row; lumping
  # after counting leaves many "Other" rows per OS that geom_tile() would draw
  # on top of one another, showing only an arbitrary one.
  mutate(version = fct_lump(version, n = 15)) %>%
  count(version, os) %>%
  filter(!is.na(os)) %>%
  # Order versions by their total downloads across operating systems.
  mutate(version = fct_reorder(version, n, .fun = sum)) %>%
  ggplot(aes(os, version, fill = n)) +
  geom_tile() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12)
  ) +
  labs(
    fill = "# of R downloads",
    title = "# of R Downloads with Different Versions and OS Platforms"
  )

# Bar chart of download counts by R version (15 most frequent, rest pooled
# into "Other"), one panel per operating system.
r_raw %>%
  # Lump before counting so "Other" is one row per OS rather than one row per
  # lumped version.
  mutate(version = fct_lump(version, n = 15)) %>%
  count(version, os) %>%
  filter(!is.na(os)) %>%
  # Order versions by total downloads so "Other" is placed by its pooled size.
  mutate(version = fct_reorder(version, n, .fun = sum)) %>%
  ggplot(aes(n, version, fill = version)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~os) +
  theme(
    strip.text = element_text(size = 13, face = "bold"),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12)
  ) +
  labs(
    x = "# of R downloads",
    title = "# of R Downloads with Different Versions and OS Platforms"
  )

# Daily number of downloads for each operating system, drawn as a line with
# points plus a fitted linear trend per OS.
r_raw %>%
  group_by(os, date) %>%
  summarise(downloads = n()) %>%
  ggplot(mapping = aes(x = date, y = downloads, colour = os)) +
  geom_line() +
  geom_point() +
  # Force the y axis to start at zero.
  expand_limits(y = 0) +
  scale_x_date(date_breaks = "2 months") +
  geom_smooth(method = "lm") +
  labs(title = "Time-series Total # of R Downloads")

## `summarise()` regrouping output by 'os' (override with `.groups` argument)

## `geom_smooth()` using formula 'y ~ x'

# Total daily downloads (all operating systems combined), coloured by
# calendar year with a fitted linear trend per year.
r_raw %>%
  # Count per date only: the colour groups are years, so keeping one row per
  # (os, date) would make each line jump between the OS series on every day.
  count(date, name = "downloads") %>%
  ggplot(aes(date, downloads, color = factor(year(date)))) +
  geom_line() +
  geom_point() +
  expand_limits(y = 0) +
  scale_x_date(date_breaks = "2 months") +
  geom_smooth(method = "lm") +
  labs(color = NULL) +
  ggtitle("Time-series Total # of R Downloads")

## `geom_smooth()` using formula 'y ~ x'

Package Size

# Heat map of the median download size (MB) by operating system and R version
# (5 most frequent versions, rest pooled into "Other").
r_raw %>%
  mutate(version = fct_lump(version, n = 5)) %>%
  # Reduce to one value per cell; plotting the raw rows draws one tile per
  # download on top of each other, so the fill would be an arbitrary row's size.
  group_by(os, version) %>%
  summarize(mb = median(size) / 10^6, .groups = "drop") %>%
  ggplot(aes(os, version, fill = mb)) +
  geom_tile() +
  theme(
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 13)
  ) +
  labs(fill = "median size (MB)")

Weekday

# Distinct downloaders per hour of day, one line per weekday.
r_raw %>%
  # NOTE: weekdays() returns names in the session locale; the levels below
  # assume an English locale.
  mutate(weekday = factor(weekdays(date), levels = c("Monday",
    "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")),
         hour = hour(time)) %>%
  group_by(weekday, hour) %>%
  # ip_id is only unique within a day in the CRAN logs, so a distinct
  # downloader is a (date, ip_id) pair; counting ip_id alone across many dates
  # collapses different IPs that happen to share an id.
  summarize(downloads = n_distinct(date, ip_id)) %>%
  ggplot(aes(hour, downloads, color = weekday, linetype = weekday)) +
  geom_line(size = 1) +
  labs(color = NULL, linetype = NULL, y = "downloads (distinct IPs)")

## `summarise()` regrouping output by 'weekday' (override with `.groups` argument)

The following analysis is inspired by and adapted in part from David Robinson’s code: when I was working on my own analysis, I did not pay attention to the ip_id column, which has a shocking number of duplicates.

# Number of log rows per ip_id, most frequent first.
r_raw %>%
  count(ip_id, sort = TRUE)

## # A tibble: 3,748 x 2
##    ip_id     n
##    <dbl> <int>
##  1     1  7093
##  2     7  5385
##  3     2  5158
##  4     5  4477
##  5     6  3756
##  6    10  3623
##  7     4  3341
##  8    12  2369
##  9    18  2295
## 10     3  2219
## # ... with 3,738 more rows

# Histogram of ip_id values over all log rows.
ggplot(r_raw, aes(x = ip_id)) +
  geom_histogram()

In his code, Robinson used as.POSIXlt() to combine date and time into a timestamp column, datetime, but I found that ymd(date) + hms(time) does the same job and, in some sense, is easier. The window function lag() is then used to obtain the gaps between consecutive downloads within each ip_id.

# Histogram (log-scaled x axis) of the time between consecutive downloads that
# share the same ip_id.
r_raw %>%
  # mutate(datetime = as.POSIXlt(date) + time) %>%
  mutate(datetime = ymd(date) + hms(time)) %>%
  arrange(datetime) %>%
  group_by(ip_id) %>%
  # Pin the unit to seconds. Plain subtraction lets R pick the unit (secs,
  # mins, hours, days) from the values, which happens separately per group and
  # would mix gaps measured in different units on one axis.
  mutate(
    gap = as.numeric(difftime(datetime, lag(datetime), units = "secs"))
  ) %>%
  # The first download of each ip_id has no predecessor.
  filter(!is.na(gap)) %>%
  ggplot(aes(gap)) +
  geom_histogram() +
  # Dashed red line marks one day (86,400 seconds).
  geom_vline(color = "red", lty = 2, xintercept = 86400) +
  # Breaks are powers of 60 seconds: 60^3 s = 2.5 days, 60^4 s = 150 days.
  scale_x_log10(breaks = 60 ^ (0:4),
                labels = c("Second", "Minute", "Hour", "2.5 Days", "150 Days")) +
  scale_y_continuous(labels = comma)

Note: First time using n_distinct() and floor_date().

# Distinct downloaders per week (weeks start on Monday).
r_raw %>%
  group_by(week = floor_date(date, "week", week_start = 1)) %>%
  # ip_id is only unique within a day in the CRAN logs, so count distinct
  # (date, ip_id) pairs; distinct ip_id alone undercounts across a week.
  summarize(n = n_distinct(date, ip_id)) %>%
  # Drop the earliest week (presumably incomplete in the data).
  filter(week > min(week)) %>%
  ggplot(aes(week, n)) +
  geom_line() +
  expand_limits(y = 0) +
  labs(y = "# of R downloads per week (distinct IPs)")

Package Downloads

# Read the CRAN package download log for 2018-10-27 and preview it.
package_downloads <- read_csv(
  file = "http://cran-logs.rstudio.com/2018/2018-10-27.csv.gz"
)

head(package_downloads, n = 6)

## # A tibble: 6 x 10
##   date       time        size r_version r_arch r_os     package  version country
##   <date>     <time>     <dbl> <chr>     <chr>  <chr>    <chr>    <chr>   <chr>  
## 1 2018-10-27 22:43:31 2475672 3.5.1     x86_64 darwin1~ openssl  1.0.2   NA     
## 2 2018-10-27 22:43:32  184625 3.5.1     x86_64 darwin1~ callr    3.0.0   NA     
## 3 2018-10-27 22:43:31  266636 3.5.0     x86_64 linux-g~ fansi    0.4.0   US     
## 4 2018-10-27 22:43:32  367563 3.5.0     x86_64 linux-g~ curl     3.2     US     
## 5 2018-10-27 22:43:30   45922 3.4.1     x86_64 linux-g~ htmltoo~ 0.3.6   US     
## 6 2018-10-27 22:43:30   57664 3.5.1     x86_64 darwin1~ viridis~ 0.3.0   NA     
## # ... with 1 more variable: ip_id <dbl>

# Ten packages downloaded by the most distinct ip_ids from the US in this log.
package_downloads %>%
  filter(country == "US") %>%
  group_by(package) %>%
  summarize(n = n_distinct(ip_id)) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  # Order the bars by download count.
  mutate(package = fct_reorder(package, n)) %>%
  ggplot(aes(n, package, fill = package)) +
  geom_col(show.legend = FALSE) +
  labs(x = "# of downloads")