Netflix Visualization with LASSO on Description Words
Sat, Mar 26, 2022
5-minute read
This blog post is diving into an interesting dataset about Netflix shows. Some graphs (network plots) will be shown, and since the dataset contains text information, using LASSO model to predict some show rating is suitable.
library(tidyverse)
library(lubridate)
library(widyr)
library(tidylo)
library(tidytext)
library(ggraph)
library(glmnet)
library(broom)
theme_set(theme_bw())
netflix <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-20/netflix_titles.csv') %>%
mutate(date_added = mdy(date_added)) %>%
separate(duration, into = c("duration", "duration_unit"), sep = " ") %>%
mutate(duration_unit = str_to_lower(duration_unit),
duration = as.numeric(duration)) %>%
rename(category = listed_in) %>%
mutate(mature = rating %in% c("TV-MA", "R", "NC-17"))
netflix
## # A tibble: 7,787 x 14
## show_id type title director cast country date_added release_year rating
## <chr> <chr> <chr> <chr> <chr> <chr> <date> <dbl> <chr>
## 1 s1 TV Show 3% <NA> João~ Brazil 2020-08-14 2020 TV-MA
## 2 s2 Movie 7:19 Jorge Mic~ Demi~ Mexico 2016-12-23 2016 TV-MA
## 3 s3 Movie 23:59 Gilbert C~ Tedd~ Singap~ 2018-12-20 2011 R
## 4 s4 Movie 9 Shane Ack~ Elij~ United~ 2017-11-16 2009 PG-13
## 5 s5 Movie 21 Robert Lu~ Jim ~ United~ 2020-01-01 2008 PG-13
## 6 s6 TV Show 46 Serdar Ak~ Erda~ Turkey 2017-07-01 2016 TV-MA
## 7 s7 Movie 122 Yasir Al ~ Amin~ Egypt 2020-06-01 2019 TV-MA
## 8 s8 Movie 187 Kevin Rey~ Samu~ United~ 2019-11-01 1997 R
## 9 s9 Movie 706 Shravan K~ Divy~ India 2019-04-01 2019 TV-14
## 10 s10 Movie 1920 Vikram Bh~ Rajn~ India 2017-12-15 2008 TV-MA
## # ... with 7,777 more rows, and 5 more variables: duration <dbl>,
## # duration_unit <chr>, category <chr>, description <chr>, mature <lgl>
of Netflix shows:
netflix %>%
count(year = year(date_added), type) %>%
ggplot(aes(year, n, color = type)) +
geom_line(size = 1) +
geom_point() +
labs(x = NULL,
y = "# of shows",
color = NULL,
title = "# of Shows Added by Netflix") +
scale_x_continuous(breaks = seq(2008, 2020, 2))
Show duration:
netflix %>%
filter(!is.na(date_added)) %>%
ggplot(aes(factor(year(date_added)), duration, fill = type, color = type)) +
geom_boxplot(alpha = 0.6) +
facet_wrap(~type, ncol = 1, scales = "free_y") +
labs(x = NULL,
y = "minutes/seasons",
color = NULL,
fill = NULL,
title = "How long does movie/tv show last?")
Weighted log odds on show category per country:
netflix %>%
separate_rows(category, sep = ", ") %>%
count(country, category, sort = T) %>%
bind_log_odds(country, category, n) %>%
group_by(country = fct_lump(country, n = 10)) %>%
slice_max(abs(log_odds_weighted), n = 5, with_ties = F) %>%
ungroup() %>%
filter(country != "Other") %>%
mutate(category = reorder_within(category, log_odds_weighted, country)) %>%
ggplot(aes(log_odds_weighted, category, fill = country)) +
geom_col(show.legend = F) +
facet_wrap(~country, scales = "free_y", ncol = 2) +
scale_y_reordered() +
labs(x = "weighted log odds",
y = NULL,
title = "Which show category is most characteristic to country?")
How correlated are the categories?
set.seed(2022)
netflix %>%
separate_rows(category, sep = ", ") %>%
pairwise_cor(category, show_id, sort = T) %>%
head(200) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(color = correlation)) +
geom_node_point() +
geom_node_text(aes(label = name, color = name), repel = T) +
scale_edge_color_continuous(low = "pink",
high = "green") +
theme_void() +
guides(color = "none") +
labs(edge_width = "correlation",
title = "How are the categories correlated across shows?") +
theme(plot.title = element_text(size = 18))
added year - released year:
netflix %>%
mutate(gap_year = year(date_added) - release_year) %>%
filter(fct_lump(country, n = 9) != "Other",
gap_year > 0) %>%
ggplot(aes(gap_year, fill = type)) +
geom_histogram(alpha = 0.8) +
facet_wrap(~country) +
labs(x = "added year - released year",
fill = NULL,
title = "How long does Netflix add shows after their release?")
netflix %>%
select(show_id, type, description) %>%
unnest_tokens(word, description) %>%
anti_join(stop_words) %>%
add_count(type, word, name = "type_word_count") %>%
add_count(word, name = "word_count") %>%
select(-show_id) %>%
pivot_longer(cols = c(type_word_count: word_count)) %>%
distinct() %>%
filter(name != "word_count") %>%
group_by(type) %>%
slice_max(value, n = 20) %>%
ungroup() %>%
mutate(word = reorder_within(word, value, type)) %>%
ggplot(aes(value, word, fill = type)) +
geom_col(show.legend = F) +
scale_y_reordered() +
facet_wrap(~type, scales = "free_y") +
labs(x = "word frequency",
y = NULL,
title = "Top 20 Most Frequent Words in Movies and TV shows")
The following idea using pairwise_cor()
on type
and title
is inspired by David Robinson’s code (link).
netflix %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word") %>%
distinct(type, title, word) %>%
add_count(word, name = "word_total") %>%
filter(word_total >= 40) %>%
pairwise_cor(word, title, sort = TRUE) %>%
filter(correlation > 0.1) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(color = correlation)) +
geom_node_point() +
geom_node_text(aes(label = name, color = name), repel = T) +
scale_edge_color_continuous(low = "pink",
high = "green") +
theme_void() +
guides(color = "none") +
labs(edge_width = "correlation",
title = "How are the description words correlated across shows?") +
theme(plot.title = element_text(size = 18))
LASSO prediction on if the show has mature contents with the reference:
other_features <- netflix %>%
select(title, director, cast, category, country) %>%
pivot_longer(cols = -title, names_to = "feature_type", values_to = "feature") %>%
filter(!is.na(feature)) %>%
separate_rows(feature, sep = ", ") %>%
mutate(feature_type = str_to_title(feature_type)) %>%
unite(feature, feature_type, feature, sep = ": ") %>%
add_count(feature, name = "feature_count") %>%
filter(feature_count >= 10)
feature_matrix <- netflix %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word") %>%
filter(!is.na(rating)) %>%
add_count(word, name = "word_total") %>%
filter(word_total >= 30) %>%
mutate(feature = paste("Description:", word)) %>%
bind_rows(other_features) %>%
cast_sparse(title, feature)
y <- netflix$mature[match(rownames(feature_matrix), netflix$title)]
# LASSO model
lass_model <- cv.glmnet(feature_matrix, y, family = "binomial")
lass_model %>%
.$glmnet.fit %>%
tidy() %>%
filter(term != "(Intercept)",
lambda == lass_model$lambda.1se) %>%
slice_max(abs(estimate), n = 50) %>%
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term, fill = estimate > 0)) +
geom_col() +
labs(x = NULL,
y = NULL,
title = "What feature would contribute the show maturity") +
theme(legend.position = "none")
Jeffreys Interval
The following code is largely copied from David Robinson’s code with slight modification (link) for the purposes of learning Jeffreys Interval.
netflix %>%
filter(!is.na(rating), !is.na(country)) %>%
group_by(type, country = fct_lump(country, 9)) %>%
summarize(n_mature = sum(rating %in% c("R", "TV-MA", "NC-17")),
n = n(),
.groups = "drop") %>%
mutate(pct_mature = n_mature / n,
conf_low = qbeta(.025, n_mature + .5, n - n_mature + .5),
conf_high = qbeta(.975, n_mature + .5, n - n_mature + .5)) %>%
ggplot(aes(pct_mature, country, color = type)) +
geom_point(aes(size = n)) +
geom_errorbar(aes(xmin = conf_low, xmax = conf_high), width = .1) +
scale_x_continuous(labels = scales::percent) +
expand_limits(x = 0) +
labs(x = "% of titles that are R/TV-MA",
y = NULL,
size = "# of shows",
color = NULL,
title = "% of Shows that are R/TV-MA with Jeffreys Intervals per Country")