IKEA Visualization & Modeling
Sun, Feb 27, 2022
3-minute read
This blog post analyzes the IKEA dataset from TidyTuesday.
library(tidyverse)
library(tidylo)
library(tidytext)
library(scales)
library(widyr)
library(ggraph)
theme_set(theme_light())
ikea <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv') %>%
select(-1) %>%
mutate(name = str_to_title(name),
old_price = if_else(old_price == "No old price", price, parse_number(old_price)))
The typical furniture names per category:
ikea %>%
count(name, category, sort = T) %>%
bind_log_odds(category, name, n) %>%
group_by(category) %>%
slice_max(log_odds_weighted, n = 10, with_ties = F) %>%
ungroup() %>%
mutate(name = reorder_within(name, log_odds_weighted, category)) %>%
ggplot(aes(log_odds_weighted, name, fill = category)) +
geom_col() +
scale_y_reordered() +
facet_wrap(~category, scales = "free_y") +
theme(legend.position = "none") +
labs(x = "weighted log odds",
y = "furniture name",
title = "Top 10 Most Represented Names per Furniture Category")
Sellable online?
ikea %>%
count(category, sellable_online, sort = T) %>%
mutate(category = fct_reorder(category, n, sum)) %>%
ggplot(aes(n, category, fill = sellable_online)) +
geom_col() +
labs(x = "# of items",
y = "",
fill = "sellable online?",
title = "Can All Items be Sold Online?")
Now let’s look close on what makes the items not sellable online.
ikea %>%
ggplot(aes(price, fill = sellable_online)) +
geom_histogram() +
scale_x_continuous(label = dollar)
There are just few items that are not sellable online, and the price of these items varies, but on the lower end.
The most disounted items:
ikea %>%
mutate(discount = old_price - price) %>%
filter(discount > 0) %>%
group_by(category) %>%
slice_max(discount, n = 10) %>%
ungroup() %>%
ggplot(aes(old_price, price, color = category)) +
geom_point(aes(size = discount)) +
geom_text(aes(label = paste0(name, "($", discount, ")")),
check_overlap = T,
hjust = 1,
vjust = 1) +
scale_size_continuous(guide = "none") +
labs(x = "price before disount",
y = "price after discount",
title = "Top 10 Discounted Items per Category",
subtitle = "Item names and discount $ are shown as text") +
coord_fixed() +
theme(panel.grid = element_blank())
The more higher the item price is, the more discount it enjoys.
Which category is the most expensive category?
ikea %>%
mutate(category = fct_reorder(category, price)) %>%
ggplot(aes(price, category, fill = category)) +
geom_boxplot(show.legend = F) +
scale_x_log10(label = dollar) +
labs(x = NULL,
title = "Furniture Pricing per Category")
It seems like TV & media furniture is the lowest category on pricing, but the Wardrobes category is the highest one.
Linear regression on price being predicted by detph, height, and width:
ikea %>%
group_by(category) %>%
summarize(model = list(lm(price ~ depth + height + width)),
tidied = map(model, tidy, conf.int = T)) %>%
unnest(tidied) %>%
filter(term != "(Intercept)") %>%
mutate(category = reorder_within(category, estimate, term)) %>%
ggplot(aes(estimate, category, color = term)) +
geom_point() +
geom_vline(xintercept = 0, lty = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
facet_wrap(~term, scales = "free_y") +
scale_y_reordered() +
theme(legend.position = "none",
strip.text = element_text(size = 15),
plot.title = element_text(size = 18)) +
labs(x = "linear regression estimate",
title = "How does Furniture Dimension Affect Price?",
subtitle = "The error bars represent 95% of CI")
set.seed(2022)
ikea %>%
filter(fct_lump(designer, n = 10) != "Other") %>%
count(designer, category, sort = T) %>%
pairwise_cor(designer, category, n) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(width = correlation),
alpha = 0.3,
color = "lightblue") +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 0, check_overlap = T) +
scale_edge_width_continuous(range = c(1,3)) +
ggtitle("How Correlated of Designers within Category?",
subtitle = "Top 10 desingers are chosen.")
ikea %>%
count(other_colors, category, sort = T) %>%
pivot_wider(names_from = other_colors,
values_from = n) %>%
mutate(total = Yes + No,
pct_colors = Yes/total,
category = fct_reorder(category, pct_colors)) %>%
ggplot(aes(pct_colors, category)) +
geom_point(aes(size = total), color = "#ffcc00") +
scale_x_continuous(labels = percent) +
scale_size_continuous(breaks = seq(100, 600, 100)) +
labs(x = "% of furniture having multiple colors",
y = NULL,
size = "total # of items",
title = "Category-wise Furniture with Multiple Colors")