Ramen Ratings LASSO Prediction & Visualization
Tue, Nov 9, 2021
5-minute read
This blog post analyzes ramen rating dataset from TidyTuesday. What is interesting about this blog post is that I analyzed the same dataset a few months ago and you can check it out in this link. Here I work on it again and see how much improvement I have obtained in data science since then.
library(tidyverse)
library(tidytext)
library(Matrix)
library(rvest)
library(glmnet)
library(widyr)
ramen_ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-04/ramen_ratings.csv") %>%
filter(!is.na(stars))
ramen_ratings
## # A tibble: 3,166 x 6
## review_number brand variety style country stars
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 3180 Yum Yum Tem Tem Tom Yum Moo Deng Cup Thailand 3.75
## 2 3179 Nagatanien tom Yum Kung Rice Vermi~ Pack Japan 2
## 3 3178 Acecook Kelp Broth Shio Ramen Cup Japan 2.5
## 4 3177 Maison de Coree Ramen Gout Coco Poulet Cup France 3.75
## 5 3176 Maruchan Gotsumori Shio Yakisoba Tray Japan 5
## 6 3175 Myojo Chukazanmai Tantanmen Cup Japan 3.5
## 7 3174 TIEasy Sesame Sauce Handmade N~ Pack Taiwan 3.75
## 8 3173 Sapporo Ichiban Momosan Ramen Tonkotsu Pack United St~ 5
## 9 3172 Samlip Hi-Myon Katsuo Udon Pack South Kor~ 3.5
## 10 3171 Doll Bowl Noodle Satay & Bee~ Bowl Hong Kong 4.25
## # ... with 3,156 more rows
LASSO Model: Using variety words to predict ramen rating
variety_matrix <- ramen_ratings %>%
mutate(row_id = row_number()) %>%
unnest_tokens(word, variety) %>%
anti_join(stop_words) %>%
distinct(row_id, word) %>%
add_count(word) %>%
filter(n > 10) %>%
cast_sparse(row_id, word)
# Lining up stars with row_id
row_id <- as.integer(rownames(variety_matrix))
stars <- ramen_ratings$stars[row_id]
cv_glmnet_model <- cv.glmnet(variety_matrix, stars)
plot(cv_glmnet_model)
cv_glmnet_model$glmnet.fit %>%
tidy() %>%
filter(!str_detect(term, "(Intercept)"),
lambda == cv_glmnet_model$lambda.1se) %>%
select(word = term, coefficient = estimate) %>%
mutate(direction = if_else(coefficient < 0, "negative", "positive")) %>%
group_by(direction) %>%
slice_max(abs(coefficient), n = 20) %>%
ungroup() %>%
mutate(word = fct_reorder(word, coefficient)) %>%
ggplot(aes(coefficient, word, fill = word)) +
geom_col(show.legend = F) +
labs(x = "estimated coefficient",
y = "word",
title = "How does the description word impact ramen rating?")
Ramen Style & Country on Rating
ramen_majority <- ramen_ratings %>%
filter(style %in% c("Bowl", "Cup", "Pack", "Tray"))
ramen_majority %>%
mutate(country = fct_lump(country, n = 10)) %>%
ggplot(aes(stars, country, fill = country)) +
geom_violin(show.legend = F) +
facet_wrap(~style, scales = "free_y") +
labs(y = NULL,
x = "rating",
title = "Ramen Rating for Top 10 Countries Faceted by Style",
subtitle = "Top country is defined by the # of ramen it has") +
theme(strip.text = element_text(size = 15, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 13),
plot.title = element_text(size = 18),
plot.subtitle = element_text(size = 14)
)
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
International ramen brands
international_brands <- ramen_majority %>%
count(brand, country, sort = T) %>%
count(brand, sort = T) %>%
filter(n > 1) %>%
pull(brand)
ramen_international <- ramen_ratings %>%
filter(brand %in% international_brands)
International brand count in each country
ramen_international %>%
filter(style != "Box") %>%
count(brand, style, country, sort = T) %>%
mutate(brand = reorder_within(brand, n, style, sum)) %>%
ggplot(aes(n, brand, fill = country)) +
geom_col() +
facet_wrap(~style, scales = "free_y") +
scale_y_reordered() +
labs(y = NULL,
x = "count",
title = "International Ramen Brand Count Per Country Faceted by Style") +
theme(strip.text = element_text(size = 15, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 9),
plot.title = element_text(size = 18),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8)
)
International brand rating
ramen_international %>%
filter(style != "Box") %>%
ggplot(aes(stars, brand, fill = brand)) +
geom_violin(show.legend = F) +
facet_wrap(~style, scales = "free_y") +
labs(y = NULL,
x = "rating",
title = "International Ramen Brands and Their Ratings") +
theme(strip.text = element_text(size = 15, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 9),
plot.title = element_text(size = 18),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8)
)
Learning from David
From here to the end of the blog post, everything is inspired/sparked by David Robinson’s code, which is about web scraping and beyond.
ramen_ratings %>%
#select(-variety) %>%
pivot_longer(-c(review_number, stars), names_to = "variable", values_to = "value") %>%
count(variable, value, sort = T) %>%
group_by(variable) %>%
slice_max(n, n = 20) %>%
ungroup() %>%
mutate(value = fct_reorder(value, n)) %>%
ggplot(aes(n, value)) +
geom_col() +
facet_wrap(~variable, scales = "free")
Based on the plot above, we can use fct_lump()
to lump some variables.
ramen_processed <- ramen_ratings %>%
mutate(brand = fct_lump(brand, 20),
country = fct_lump(country, 10),
style = fct_lump(style, 4)) %>%
replace_na(list(style = "Other")) %>%
mutate(brand = fct_relevel(brand, "Other"),
country = fct_relevel(country, "Other"),
style = fct_relevel(style, "Pack"))
Linear model plot
ramen_processed %>%
lm(stars ~ brand + style + country, data = .) %>%
tidy(conf.int = T) %>%
filter(term != "(Intercept)") %>%
extract(term, c("category", "term"), "^([a-z]+)([A-Z].*)") %>%
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term, color = category)) +
geom_point() +
geom_vline(aes(xintercept = 0), linetype = 2, color = "orange") +
geom_errorbarh(aes(xmin = conf.low,
xmax = conf.high)) +
facet_wrap(~category, ncol = 1, scales = "free_y") +
theme(legend.position = "none") +
labs(y = NULL)
Web scraping
ramen_pages <- tibble(page_num = 1:10)
ramen_pages <- ramen_pages %>%
mutate(page_link = paste0("https://www.theramenrater.com/page/", page_num, "/"))
get_links <- function(link){
read_html(link) %>%
html_nodes(".entry-title a") %>%
html_attr("href")
}
ramen_pages <- ramen_pages %>%
mutate(text_link = map(page_link, get_links))
get_text <- function(link){
read_html(link) %>%
html_nodes(".entry-content p") %>%
html_text() %>%
as_tibble() %>%
rename(text = "value") %>%
filter(text != "")
}
ramen_pages <- ramen_pages %>%
unnest(text_link) %>%
mutate(text = map(text_link, get_text))
text_df <- ramen_pages %>%
mutate(text_id = row_number()) %>%
unnest(text) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
filter(!word %in% c("click", "finished", "enlarge", "stars", "detail")) %>%
distinct(text_id, word, .keep_all = T)
## Joining, by = "word"
word_cors <- text_df %>%
add_count(word) %>%
filter(n > 50) %>%
filter(str_detect(word, "[a-z]+")) %>%
pairwise_cor(word, text_id, sort = T)
word_cors
## # A tibble: 600 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 recipe cook 0.932
## 2 cook recipe 0.932
## 3 prepare minutes 0.910
## 4 minutes prepare 0.910
## 5 stir minutes 0.892
## 6 stir prepare 0.892
## 7 minutes stir 0.892
## 8 prepare stir 0.892
## 9 finally stir 0.892
## 10 stir finally 0.892
## # ... with 590 more rows
We can see highly correlated words from word_cors
.