Analyzing Beyonce and Taylor Swift Lyrics
Sat, Feb 19, 2022
5-minute read
This blog post analyzes the lyrics from Beyonce and Taylor Swift, and the datasets are from TidyTuesday.
library(tidyverse)
library(lubridate)
library(scales)
library(tidytext)
library(tidylo)
theme_set(theme_light())
beyonce_lyrics <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/beyonce_lyrics.csv')
taylor_swift_lyrics <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/taylor_swift_lyrics.csv') %>%
rename(song = Title) %>%
janitor::clean_names()
sales <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/sales.csv')
charts <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/charts.csv') %>%
mutate(re_release = str_remove(re_release, "\\[.+\\]$"),
released = str_remove(released, "\\(.+$")) %>%
mutate(released = mdy(released),
re_release = mdy(re_release),
chart_position = as.numeric(chart_position)) %>%
rename(country_chart = chart,
album = title)
Chart positions per country:
charts %>%
ggplot(aes(released, chart_position, color = album)) +
geom_line(aes(group = artist), size = 1) +
geom_point() +
geom_text(aes(label = album), hjust = 0, vjust = 1, check_overlap = T) +
facet_grid(country_chart ~ artist) +
scale_y_reverse() +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
theme(legend.position = "none",
strip.text = element_text(size = 15),
plot.title = element_text(size = 18)) +
labs(x = "released year",
y = "chart position",
title = "Country-wise Album Chart Position")
Album formats:
charts %>%
separate_rows(formats, sep = ",\\s+") %>%
group_by(artist, album, formats) %>%
summarize(n = n()) %>%
ungroup() %>%
mutate(album = fct_reorder(album, n, sum)) %>%
ggplot(aes(n, album, fill = formats)) +
geom_col() +
facet_wrap(~artist, scales = "free_y") +
scale_x_continuous(breaks = seq(0, 80, 10)) +
labs(x = "# of formats",
fill = "format",
title = "Albums Distribution Formats across Multiple Countries")
Working on sales
:
sales %>%
filter(country == "US") %>%
mutate(title = fct_reorder(title, sales),
artist = fct_reorder(artist, -sales, sum)) %>%
ggplot(aes(sales, title, fill = artist)) +
geom_col() +
scale_x_continuous(labels = dollar) +
labs(x = NULL,
y = "album",
fill = NULL,
title = "Sales Volume per Album")
Working on taylor_swift_lyrics
Weighted log odds on Album words:
taylor_swift_lyrics %>%
unnest_tokens(word, lyrics) %>%
anti_join(stop_words) %>%
count(album, word, sort = T) %>%
bind_log_odds(album, word, n) %>%
group_by(album) %>%
slice_max(log_odds_weighted, n = 5) %>%
ungroup() %>%
mutate(word = reorder_within(word, log_odds_weighted, album)) %>%
ggplot(aes(log_odds_weighted, word, fill = album)) +
geom_col(show.legend = F) +
scale_y_reordered() +
facet_wrap(~album, scales = "free_y") +
labs(x = "weighted log odds",
y = NULL,
title = "Top 5 Lyrics Words with the Largest Weighted Log Odds",
subtitle = "For all ablums") +
theme(strip.text = element_text(size = 15),
plot.title = element_text(size = 17))
Weighted log odds on song words:
taylor_swift_lyrics %>%
mutate(lyrics_length = str_length(lyrics)) %>%
slice_max(lyrics_length, n = 9) %>%
select(-lyrics_length) %>%
unnest_tokens(word, lyrics) %>%
anti_join(stop_words) %>%
count(song, word, sort = T) %>%
bind_log_odds(song, word, n) %>%
group_by(song) %>%
slice_max(log_odds_weighted, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder_within(word, log_odds_weighted, song),
song = fct_reorder(song, -log_odds_weighted, sum)) %>%
ggplot(aes(log_odds_weighted, word, fill = song)) +
geom_col(show.legend = F) +
scale_y_reordered() +
facet_wrap(~song, scales = "free_y") +
labs(x = "weighted log odds",
y = NULL,
title = "Top 5 Lyrics Words with the Largest Weighted Log Odds",
subtitle = "For 9 most lengthy songs") +
theme(strip.text = element_text(size = 13),
plot.title = element_text(size = 17))
beyonce_lyrics
analysis:
beyonce_processed <- beyonce_lyrics %>%
distinct(line, .keep_all = T) %>%
select(-contains("artist")) %>%
mutate(song_name = str_remove_all(song_name, "\\s?\\(.+$|\\s?\\[.+$")) %>%
filter(song_id != 2985000)
beyonce_processed
## # A tibble: 10,791 x 4
## line song_id song_name song_line
## <chr> <dbl> <chr> <dbl>
## 1 If I ain't got nothing, I got you 50396 1+1 1
## 2 If I ain't got something, I don't give a damn 50396 1+1 2
## 3 'Cause I got it with you 50396 1+1 3
## 4 I don't know much about algebra, but I know 1+1 ~ 50396 1+1 4
## 5 And it's me and you 50396 1+1 5
## 6 That's all we'll have when the world is through 50396 1+1 6
## 7 'Cause baby, we ain't got nothing without love 50396 1+1 7
## 8 Darling, you got enough for the both of us 50396 1+1 8
## 9 So come on, baby, make love to me 50396 1+1 9
## 10 When my days look low 50396 1+1 10
## # ... with 10,781 more rows
beyonce_processed
has many same songs with various song_id
, and the reason is these same songs have multiple versions (i.e., some are sung at studio, others at conference).
beyonce_joined <- beyonce_processed %>%
inner_join(
beyonce_processed %>%
select(song_id, song_name) %>%
distinct(song_name, .keep_all = T),
by = c("song_name", "song_id")
)
beyonce_joined
## # A tibble: 8,701 x 4
## line song_id song_name song_line
## <chr> <dbl> <chr> <dbl>
## 1 If I ain't got nothing, I got you 50396 1+1 1
## 2 If I ain't got something, I don't give a damn 50396 1+1 2
## 3 'Cause I got it with you 50396 1+1 3
## 4 I don't know much about algebra, but I know 1+1 ~ 50396 1+1 4
## 5 And it's me and you 50396 1+1 5
## 6 That's all we'll have when the world is through 50396 1+1 6
## 7 'Cause baby, we ain't got nothing without love 50396 1+1 7
## 8 Darling, you got enough for the both of us 50396 1+1 8
## 9 So come on, baby, make love to me 50396 1+1 9
## 10 When my days look low 50396 1+1 10
## # ... with 8,691 more rows
beyonce_joined
only contains each song with one unique song_id
.
beyonce_joined %>%
distinct(song_name, song_id) %>%
count(song_name, sort = T)
## # A tibble: 259 x 2
## song_name n
## <chr> <int>
## 1 "\"Self-Titled\" Part 1 . The Visual Album" 1
## 2 "\"Self-Titled\" Part 2 . Imperfection" 1
## 3 "***Flawless" 1
## 4 "<U+200B>come home" 1
## 5 "<U+200B>war" 1
## 6 "<U+200E>blind trust" 1
## 7 "1+1" 1
## 8 "6 Inch" 1
## 9 "7/11" 1
## 10 "A Woman Like Me" 1
## # ... with 249 more rows
Now we can see confirm it.
Sentiment analysis:
beyonce_joined %>%
unnest_tokens(word, line) %>%
anti_join(stop_words) %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(song_name) %>%
summarize(value = mean(value)) %>%
ungroup() %>%
group_by(value > 0) %>%
slice_max(abs(value), n = 5, with_ties = F) %>%
ungroup() %>%
mutate(song_name = fct_reorder(song_name, value)) %>%
ggplot(aes(value, song_name, fill = value > 0)) +
geom_col() +
theme(legend.position = "none") +
labs(x = "AFINN mean value",
y = NULL,
title = "The 5 Happiest and Saddest Beyonce Songs based on AFINN")