Data Visualization on Nobel Prize Winners
Sun, Oct 31, 2021
4-minute read
Nobel Prize is a world-class prize. In this blog post, the prize winners’ information and prize category will be analyzed. The datasets are from the TidyTuesday Project, which is a wonderful online data science learning community for R users.
Load the packages and the datasets!
library(tidyverse)
library(scales)
library(lubridate)
library(WDI)
library(countrycode)
winner_all_pubs <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winner_all_pubs.csv") %>%
mutate(journal = str_to_title(journal),
affiliation = str_to_title(affiliation),
category = str_to_title(category))
winners <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv")
winner_all_pubs %>%
group_by(laureate_id, category) %>%
summarize(total_publication = n(),
prize_publication = sum(is_prize_winning_paper == "YES")) %>%
mutate(prize_ratio = prize_publication / total_publication) %>%
ungroup() %>%
ggplot(aes(prize_ratio, category, fill = category)) +
geom_violin(show.legend = F) +
scale_x_continuous(labels = percent) +
labs(x = "prize paper ratio",
title = "Category-wise Prize Paper Ratio",
subtitle = "prize paper ratio between # of winning papers and total # of papers")
winning_papers <- winner_all_pubs %>%
filter(is_prize_winning_paper == "YES")
winning_papers %>%
mutate(wait_year = prize_year - pub_year) %>%
filter(wait_year > 0) %>%
ggplot(aes(wait_year, category, fill = category)) +
geom_boxplot(show.legend = F) +
scale_x_continuous() +
labs(x = "wait year",
title = "How many years did paper publication need to wait to get the Nobel Prize?",
subtitle = "Wait year is defined as prize year minus paper publication year")
Wait median of 10+ years to win the nobel prize after paper publication!
Which paper is the winning paper for the winner?
winner_all_pubs %>%
mutate(pub_decade = 10 * floor(pub_year/10)) %>%
group_by(laureate_name) %>%
mutate(pub_rank = rank(pub_year, ties.method = "first"),
total_pub = n()) %>%
filter(is_prize_winning_paper == "YES") %>%
mutate(winning_percentage = pub_rank / total_pub) %>%
group_by(category, pub_decade) %>%
summarize(avg_win = mean(winning_percentage)) %>%
ggplot(aes(pub_decade, avg_win, color = category)) +
geom_line(size = 1) +
labs(x = "decade",
y = "average winning paper rank in the winner's career",
title = "The Average Winning Paper Rank in each Decade")
Publication Journals
winner_all_pubs %>%
group_by(journal, category) %>%
summarize(winning_paper_count = sum(is_prize_winning_paper == "YES"),
total_paper = n()) %>%
filter(winning_paper_count > 0,
total_paper > 5,
!is.na(journal)) %>%
mutate(ratio = winning_paper_count / total_paper) %>%
arrange(desc(ratio)) %>%
ungroup() %>%
mutate(journal = fct_lump(journal, n = 10, w = ratio)) %>%
filter(journal != "Other") %>%
mutate(journal = fct_reorder(journal, ratio)) %>%
ggplot(aes(ratio, journal, fill = category)) +
geom_col() +
scale_x_continuous(labels = percent) +
labs(x = "winning ratio",
title = "Top 10 Winning Journals with the Highest Winning Ratio")
winners %>%
mutate(birth_year = year(birth_date),
prize_age = prize_year - birth_year,
category = fct_reorder(category, prize_age, median, na.rm = T)) %>%
ggplot(aes(prize_age, category, fill = category)) +
geom_boxplot(show.legend = F) +
labs(x = "age when awarded the Nobel Prize",
title = "Nobel Laureate Age When Receiving the Prize") +
theme(
axis.title = element_text(size = 15),
axis.text = element_text(size = 13),
plot.title = element_text(size = 18)
)
When Nobel Prize winners’ birth country and winning country are same
same_country <- winners %>%
filter(birth_country == organization_country)
same_country %>%
count(birth_country, category, sort = T) %>%
mutate(birth_country = fct_reorder(birth_country, n, sum)) %>%
ggplot(aes(n, birth_country, fill = category)) +
geom_col()
When Nobel Prize winners’ birth country and winning country are different
diff_country <- winners %>%
filter(birth_country != organization_country)
diff_country %>%
count(birth_country, organization_country, sort = T) %>%
group_by(organization_country) %>%
summarize(total = sum(n)) %>%
ungroup() %>%
mutate(organization_country = fct_reorder(organization_country, total)) %>%
ggplot(aes(total, organization_country, fill = organization_country)) +
geom_col(show.legend = F) +
labs(x = "total # of the prize winners born outside of the country",
y = "organization country",
title = "Which country has the most prize winners born outside of the country?")
Immigration Nobel Prize Winners
winners %>%
group_by(organization_country) %>%
summarize(foreign_born_count = sum(birth_country != organization_country),
total_count = n()) %>%
mutate(foreign_born_ratio = foreign_born_count/total_count,
organization_country = fct_reorder(organization_country, foreign_born_ratio)) %>%
filter(foreign_born_ratio > 0) %>%
ggplot(aes(foreign_born_ratio, organization_country, fill = organization_country)) +
geom_col(show.legend = F) +
scale_x_continuous(labels = percent) +
labs(x = "percentage of foreign born winners",
y = NULL,
title = "Percentage of Foreign Born Nobel Prize Winners")
Gender including organizations
winners %>%
mutate(gender = coalesce(gender, laureate_type)) %>%
count(gender, category, sort = T) %>%
drop_na() %>%
complete(gender, category, fill = list(n = 0)) %>%
ggplot(aes(gender, category, fill = n)) +
geom_tile() +
scale_fill_gradient2(
high = "green",
low = "red",
mid = "white",
midpoint = 100
) +
scale_x_discrete(expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
labs(x = NULL,
y = NULL,
fill = "# of prize winners",
title = "Total # of Nobel Prize Winners Across All Categories between Men, Women & Organization",
subtitle = "Organization was only awarded Peace Prize")
Birth Country’s economic situation
economy <- WDI(indicator = c("NY.GDP.PCAP.KD"), start = 2016, end = 2016, extra = T) %>%
as_tibble()
winners %>%
mutate(country_code = countrycode(birth_country, origin = 'country.name', destination = 'iso2c')) %>%
inner_join(economy, by = c(country_code = "iso2c")) %>%
count(category, income, sort = T) %>%
mutate(income = fct_relevel(income, levels = c("Low income", "Lower middle income", "Upper middle income", "High income"))) %>%
ggplot(aes(n, income, fill = income)) +
geom_col(show.legend = F) +
facet_wrap(~category) +
theme(strip.text = element_text(size = 15, face = "bold"),
plot.title = element_text(size = 16)) +
labs(x = "# of Nobel Prize winners",
y = NULL,
title = "# of Nobel Prize Winners Birth Country Ecomnomic Level")