Medium Article Text Visualization & LASSO Model

Tue, Sep 14, 2021 7-minute read

This blog post’s data is from TidyTuesday about articles posted on Medium.

library(tidyverse)
library(lubridate)
library(tidytext)
library(widyr)
library(glmnet)
library(broom)
library(ggraph)
library(igraph)
theme_set(theme_bw())

medium <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-04/medium_datasci.csv") %>%
  mutate(year_month = make_datetime(year = year, month = month, day = day))

head(medium)

## # A tibble: 6 x 22
##      x1 title  subtitle  image author publication  year month   day reading_time
##   <dbl> <chr>  <chr>     <dbl> <chr>  <chr>       <dbl> <dbl> <dbl>        <dbl>
## 1     2 Onlin~ Online a~     1 Emma ~ NA           2017     8     1            5
## 2     5 A.I. ? NA            0 Sanpa~ NA           2017     8     1            2
## 3    11 Futur~ From Phy~     1 Z      NA           2017     8     1            3
## 4    12 The V~ A true p~     1 Emiko~ MILLENNIAL~  2017     8     1            5
## 5    17 Os Me~ mas pera~     1 Giova~ NEW ORDER    2017     8     1            3
## 6    18 The F~ Original~     1 Syed ~ Towards Da~  2017     8     1            6
## # ... with 12 more variables: claps <dbl>, url <chr>, author_url <chr>,
## #   tag_ai <dbl>, tag_artificial_intelligence <dbl>, tag_big_data <dbl>,
## #   tag_data <dbl>, tag_data_science <dbl>, tag_data_visualization <dbl>,
## #   tag_deep_learning <dbl>, tag_machine_learning <dbl>, year_month <dttm>

skimr::skim(medium)

Table 1: Data summary
Name	medium
Number of rows	78388
Number of columns	22
_______________________
Column type frequency:
character	6
numeric	15
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
title	1822	0.98	1	230	73953
subtitle	30069	0.62	1	362	45838
author	286	1.00	1	56	33304
publication	44072	0.44	1	133	7297
url	0	1.00	36	827	78388
author_url	0	1.00	19	68	36006

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
x1	1	130632.97	147899.21	0	8180.75	30598	266207.2	437504	▇▁▁▂▂
image	1	0.65	0.48	0	0.00	1	1.0	1	▅▁▁▁▇
year	1	2017.66	0.47	2017	2017.00	2018	2018.0	2018	▅▁▁▁▇
month	1	6.21	3.31	1	3.00	6	9.0	12	▇▆▆▅▇
day	1	15.70	8.80	1	8.00	16	23.0	31	▇▇▇▇▆
reading_time	1	4.31	3.35	0	2.00	4	5.0	100	▇▁▁▁▁
claps	1	123.05	822.99	0	0.00	4	55.0	60000	▇▁▁▁▁
tag_ai	1	0.18	0.38	0	0.00	0	0.0	1	▇▁▁▁▂
tag_artificial_intelligence	1	0.38	0.48	0	0.00	0	1.0	1	▇▁▁▁▅
tag_big_data	1	0.11	0.31	0	0.00	0	0.0	1	▇▁▁▁▁
tag_data	1	0.12	0.33	0	0.00	0	0.0	1	▇▁▁▁▁
tag_data_science	1	0.20	0.40	0	0.00	0	0.0	1	▇▁▁▁▂
tag_data_visualization	1	0.06	0.23	0	0.00	0	0.0	1	▇▁▁▁▁
tag_deep_learning	1	0.08	0.28	0	0.00	0	0.0	1	▇▁▁▁▁
tag_machine_learning	1	0.32	0.47	0	0.00	0	1.0	1	▇▁▁▁▃

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
year_month	0	1	2017-08-01	2018-08-01	2018-03-01	366

Time Series Medium Articles

medium %>%
  group_by(year_month) %>%
  summarize(total = n()) %>%
  ggplot(aes(year_month, total, color = factor(year(year_month)))) +
  geom_point(show.legend = F) +
  geom_smooth(method = "lm") +
  labs(color = NULL, x = NULL, y = "# of articles", title = "# of Medium Articles Along With Time")

Reading time along with claps

medium_tags %>%
  filter(reading_time <= 60) %>%
  ggplot(aes(factor(reading_time), claps, fill = factor(reading_time))) +
  geom_boxplot() +
  scale_y_log10(labels = scales::comma) +
  coord_flip() +
  theme(
    legend.position = "none"
  ) +
  labs(x = "reading time (minutes)", y = "claps (log-scale)", title = "# of Claps with Reading Time")

Tags with claps

medium_tags %>%
  filter(tag_or_not == 1) %>%
  mutate(tag = fct_reorder(tag, claps, median, na.rm = TRUE)) %>%
  ggplot(aes(tag, claps, fill = tag)) +
  geom_boxplot() +
  scale_y_log10(labels = scales::comma) +
  coord_flip() +
  labs(y = "claps (log10 scale)", title = "Tags and Claps")

Authors with claps

medium_tags %>%
  filter(tag_or_not == 1) %>%
  group_by(author, tag) %>%
  summarize(avg_claps = mean(claps)) %>%
  #arrange(desc(avg_claps)) %>%
  distinct() %>%
  ungroup() %>%
  right_join(medium) %>% View()


top10_authors <- medium_tags %>%
  group_by(author) %>%
  summarize(avg_claps = mean(claps)) %>%
  arrange(desc(avg_claps)) %>%
  head(10) %>%
  select(author) %>%
  pull()

medium_tags %>%
  filter(author %in% top10_authors, tag_or_not == 1) %>%
  count(author, tag, sort = T)

## # A tibble: 15 x 3
##    author            tag                         n
##    <chr>             <chr>                   <int>
##  1 Fran ois Chollet  artificial_intelligence     2
##  2 Andrej Karpathy   artificial_intelligence     1
##  3 Andrej Karpathy   machine_learning            1
##  4 Anything App      ai                          1
##  5 Georges Abi-Heila data                        1
##  6 Kai Stinchcombe   ai                          1
##  7 Michael Jordan    artificial_intelligence     1
##  8 Michael Jordan    data_science                1
##  9 Michael Jordan    machine_learning            1
## 10 Radu Raicea       artificial_intelligence     1
## 11 Sophia Ciocca     artificial_intelligence     1
## 12 Sophia Ciocca     machine_learning            1
## 13 Xiaohan Zeng      machine_learning            1
## 14 YK Sugi           data_science                1
## 15 YK Sugi           data_visualization          1

Surprisingly, top 10 authors who obtained the largest numbers of claps only have only published at least 2 articles on Medium. That is to say, they might be invited to write something on the platform but are not regular bloggers.

medium_tags %>%
  filter(tag_or_not == 1) %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(total = n()) %>%
  arrange(desc(total)) %>%
  filter(
    str_detect(word, "[a-z]")
  ) %>%
  head(25) %>%
  mutate(word = fct_reorder(word, total)) %>%
  ggplot(aes(total, word, fill = word)) +
  geom_col(show.legend = F) +
  labs(x = "word count", y = "title word", title = "Top 25 Most Frequent Article Title Words")

medium_tags %>%
  filter(tag_or_not == 1) %>%
  unnest_tokens(word, subtitle) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(total = n()) %>%
  arrange(desc(total)) %>%
  filter(
    str_detect(word, "[a-z]")
  ) %>%
  head(25) %>%
  mutate(word = fct_reorder(word, total)) %>%
  filter(!is.na(word)) %>%
  ggplot(aes(total, word, fill = word)) +
  geom_col(show.legend = F) +
  labs(x = "word count", y = "subtitle word", title = "Top 25 Most Frequent Article Subtitle Words")

Images and Claps

medium %>%
  mutate(image = factor(image)) %>%
  ggplot(aes(image, claps, fill = image)) +
  geom_boxplot() +
  scale_y_log10(labels = scales::comma) +
  theme(
    legend.position = "none"
  ) +
  labs(x = "contains image or not", y = "# of claps", title = "# of Claps with Whether Articles Containing Image or Not")

The following graph and LASSO model is adopted from David Robinson’s code with some slight modification.

medium_tags_graph <- medium_tags %>%
  filter(tag_or_not == 1) %>%
  mutate(article_id = row_number()) %>%
  select(title, tag, article_id, claps) %>% 
  unnest_tokens(word, title) %>%
  anti_join(stop_words) %>%
  add_count(word) %>% 
  filter(n > 500, str_detect(word, "[a-z]"))

word_cor <- medium_tags_graph %>%
  pairwise_cor(word, article_id, sort = TRUE)

vertices <- medium_tags_graph %>% 
  group_by(word) %>%
  summarize(median_claps = median(claps),
            geometric_mean_claps = exp(mean(log(claps + 1))) - 1,
            occurences = n()) %>%
  arrange(desc(median_claps)) %>%
  filter(word %in% word_cor$item1 |
         word %in% word_cor$item2)

word_cor %>% 
  head(50) %>% 
  graph_from_data_frame(vertices = vertices) %>%
  ggraph(layout = "fr") +
  geom_edge_link() +
  geom_node_point(aes(size = occurences,
                      color = geometric_mean_claps)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_color_gradient2(low = "red", high = "blue", midpoint = 20) +
  theme_void() +
  labs(title = "Claps with Titles",
       subtitle = "Color shows the geometric mean of # of claps on articles with this word in the title",
       size = "# of occurrences",
       color = "claps")

LASSO Model

sparse_matrix <- medium_tags_graph %>%
  distinct(article_id, word, claps) %>%
  cast_sparse(article_id, word)

claps <- medium_tags_graph$claps[match(rownames(sparse_matrix), as.character(medium_tags_graph$article_id))]

lasso_model <- cv.glmnet(sparse_matrix, log(claps + 1))

tidy(lasso_model$glmnet.fit) %>%
  filter(term %in% c("ai", "learning", "world", "deep", "google", "marketing")) %>% 
  ggplot(aes(lambda, estimate, color = term)) +
  geom_line(size = 1) +
  scale_x_log10()