Tweet Corpus Text Analysis

Sun, Sep 19, 2021 6-minute read

The dataset analyzed in this blog post is from TidyTuesday and contains tweets with the hashtags #rstats and #TidyTuesday.

library(tidyverse)
library(tidytext)
library(scales)
library(hms)
library(lubridate)
library(rvest)
library(ggrepel)
theme_set(theme_light())
tt <- read_rds(url("https://github.com/rfordatascience/tidytuesday/blob/master/data/2019/2019-01-01/tidytuesday_tweets.rds?raw=true")) %>% 
  mutate(time = as_hms(created_at),
         date = date(created_at),
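         # account age in fractional years: a lubridate Interval divided by a one-year period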
         account_age = interval(account_created_at, created_at) / years(1),
         week = floor_date(date, "week", week_start = 1)) 

This is my first time loading a dataset with read_rds(). If the source is a URL, don’t forget to wrap it in url().

Checking whether user_id and screen_name map one-to-one.

tt %>%
  select(user_id, screen_name) %>%
  distinct() %>%
  count(user_id, screen_name, sort = TRUE)
## # A tibble: 478 x 3
##    user_id             screen_name         n
##    <chr>               <chr>           <int>
##  1 1000252830          xenggg              1
##  2 1001511262545592320 WeAreRLadies        1
##  3 1004392971419123713 marinecdf           1
##  4 1004465754253742082 greens_wgc          1
##  5 1006621907737808896 OliXcl              1
##  6 1009356826801004544 Christi58451746     1
##  7 1010160326137057280 AllieSherier        1
##  8 1012444081          lponnala            1
##  9 1012925456717352960 bidulomique         1
## 10 1016121144          FerreMtrCrtx        1
## # ... with 468 more rows

Based on the output above, user_id and screen_name appear to map one-to-one, which means that no user changed their screen name within the time window covered by the dataset. That is to say, we can use either user_id or screen_name in further analysis; using both would be redundant.
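
Strictly speaking, counting the distinct pairs always yields n = 1 by construction, so a more direct test is to look for any user_id tied to more than one screen_name. A minimal sketch; an empty result confirms the one-to-one mapping:

tt %>%
  distinct(user_id, screen_name) %>%
  count(user_id) %>%
  filter(n > 1)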

Tweets from different platforms

tt %>% 
  count(date, source, sort = TRUE) %>%
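  # keep the 5 most frequent sources (weighted by tweet count) and lump the rest into "Other"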
  mutate(source = fct_lump(source, n = 5, w = n),
         source = fct_reorder(source, n)) %>%
  ggplot(aes(date, n, color = source)) +
  geom_line(size = 1) +
  scale_x_date(date_labels = "%y %b") +
  facet_wrap(~source) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  ) +
  labs(x = NULL, y = "# of tweets",
       title = "Tweets from Various Platforms")

Tweet hours and related metrics

tt %>%
  mutate(hour = hour(time)) %>%
  group_by(hour) %>%
  summarize(`total retweets` = sum(retweet_count),
            `total likes` = sum(favorite_count),
            `total tweets` = n()) %>%
  pivot_longer(-hour, names_to = "rt_or_fav", values_to = "sum_of_hour") %>% 
  ggplot(aes(hour, sum_of_hour, color = rt_or_fav)) +
  geom_line(size = 1) +
  labs(color = NULL,
       x = "tweet hour",
       y = "# of likes/retweets",
       title = "Tweet Creation Hour with # of Likes/Retweets") +
  scale_x_continuous(breaks = seq(0, 23))

Interestingly, as the next plot shows, for accounts with fewer than 20,000 followers there is a downward trend between the number of years since the account was created and the number of followers. My guess is that many of the older accounts do not tweet actively and therefore fail to attract followers.

Twitter account age and followers

tt %>%
  filter(followers_count < 20000) %>%
  ggplot(aes(account_age, followers_count)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "years since account created", y = "# of followers",
       title = "The Relationship between Account Age & Follower Count") +
  scale_x_continuous(n.breaks = 7)
## `geom_smooth()` using formula 'y ~ x'

tt %>%
  ggplot(aes(account_age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
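
One way to probe the earlier guess about inactive accounts is to relate each account’s age to how often it appears here; tweet counts within this corpus are only a rough proxy for overall activity, but a downward slope would be consistent with the inactivity explanation. A sketch:

tt %>%
  group_by(screen_name) %>%
  summarize(account_age = max(account_age),
            n_tweets = n()) %>%
  ggplot(aes(account_age, n_tweets)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_y_log10() +
  labs(x = "years since account created",
       y = "# of tweets in this corpus (log scale)")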

Text analysis


The following code is inspired by and largely adapted from David Robinson’s code.

tweet_words <- tt %>%
  select(screen_name, text, retweet_count, favorite_count, created_at, status_id,
         week) %>% 
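  # the "tweets" tokenizer keeps hashtags and @usernames intact while splitting text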
  unnest_tokens(word, text, token = "tweets") %>%
  anti_join(stop_words, by = "word") 


word_summary <- tweet_words %>%
  #filter(!screen_name %in% c("thomas_mock", "R4DScommunity")) %>%
  group_by(word) %>%
  summarize(n = n(),
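            # exp(mean(log(x + 1))) - 1 is the geometric mean of x, shifted by 1 to tolerate zeros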
            avg_retweets = exp(mean(log(retweet_count + 1))) - 1,
            avg_favorites = exp(mean(log(favorite_count + 1))) - 1) %>%
  filter(n >= 35) %>%
  arrange(desc(avg_retweets))

week_summary <- tt %>%
  group_by(week) %>%
  summarize(tweets = n(),
            avg_retweets = exp(mean(log(retweet_count + 1))) - 1)

top_words <- tweet_words %>%
  count(word, week) %>%
  bind_tf_idf(word, week, n) %>%
  arrange(desc(tf_idf)) %>%
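  # after sorting by tf-idf, distinct() keeps only the top word for each week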
  distinct(week, .keep_all = TRUE) %>%
  arrange(week)

Grabbing the dataset titles from the TidyTuesday GitHub page. When I typed ?html_node in the RStudio console, the documentation informed me that html_node() has been superseded by html_element().

week_titles <- read_html("https://github.com/rfordatascience/tidytuesday/tree/master/data/2018") %>%
  #html_node(".entry-content") %>%
  html_element("table") %>%
  html_table() %>%
  #tbl_df() %>%
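  # floor_date(..., week_start = 1) matches the Monday-based weeks computed for tt earlier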
  transmute(week = floor_date(as.Date(Date), "week", week_start = 1),
            title = Data)
week_summary %>%
  inner_join(top_words, by = "week") %>%
  inner_join(week_titles, by = "week") %>%
  ggplot(aes(week, tweets)) +
  geom_line(color = "lightblue", size = 2) +
  geom_text(aes(label = title, color = title), check_overlap = TRUE) +
  expand_limits(y = 0) +
  labs(x = "week",
       y = "# of #tidytuesday tweets each week",
       title = "# of tweets about each week's #tidytuesday",
       subtitle = "Shown is the word most specific to each week (based on TF-IDF)") +
  theme(
    legend.position = "none"
  )