U.S. PhD Data Analysis with Extensive Data Cleaning/Processing
Sat, Oct 2, 2021
5-minute read
The data context of this blog post is interesting, as it is about PhD graduates in the U.S. from various years. As a Ph.D. student majoring in Data Science, this is revelant to me, although Data Science is not shown in the datasets.
The clean dataset is from TidyTuesday, and later I will use the raw datasets from NSF to carry out extensive data cleaning, which can shed some light on data preprocessing steps from the EXCEL files.
library(tidyverse)
library(patchwork)
library(readxl)
phd <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-19/phd_by_field.csv") %>%
mutate(broad_field = str_to_title(broad_field),
year = as.integer(year))
phd
## # A tibble: 3,370 x 5
## broad_field major_field field year n_phds
## <chr> <chr> <chr> <int> <dbl>
## 1 Life Sciences Agricultural sciences and~ Agricultural economics 2008 111
## 2 Life Sciences Agricultural sciences and~ Agricultural and horti~ 2008 28
## 3 Life Sciences Agricultural sciences and~ Agricultural animal br~ 2008 3
## 4 Life Sciences Agricultural sciences and~ Agronomy and crop scie~ 2008 68
## 5 Life Sciences Agricultural sciences and~ Animal nutrition 2008 41
## 6 Life Sciences Agricultural sciences and~ Animal science, poultr~ 2008 18
## 7 Life Sciences Agricultural sciences and~ Animal sciences, other 2008 77
## 8 Life Sciences Agricultural sciences and~ Environmental science 2008 182
## 9 Life Sciences Agricultural sciences and~ Fishing and fisheries ~ 2008 52
## 10 Life Sciences Agricultural sciences and~ Food science 2008 96
## # ... with 3,360 more rows
p1 <- phd %>%
group_by(broad_field, year) %>%
summarize(total_phd = sum(n_phds, na.rm = T)) %>%
ungroup() %>%
mutate(broad_field = fct_reorder(broad_field, -total_phd, sum)) %>%
ggplot(aes(year, total_phd, color = broad_field)) +
geom_line(size = 1) +
scale_x_continuous(breaks = seq(2008, 2020)) +
theme(
plot.title = element_text(size = 18),
axis.text = element_text(size = 13),
axis.title = element_text(size = 15),
legend.position = "top"
) +
labs(x = NULL,
y = "# of Ph.D.",
color = "broad field",
title = "# of Ph.D. across Various Broad Fields")
## `summarise()` has grouped output by 'broad_field'. You can override using the `.groups` argument.
p2 <- phd %>%
group_by(year) %>%
summarize(total_phd = sum(n_phds, na.rm = T)) %>%
ggplot(aes(year, total_phd)) +
geom_line() +
scale_x_continuous(breaks = seq(2008, 2020)) +
theme(
plot.title = element_text(size = 18),
axis.text = element_text(size = 13),
axis.title = element_text(size = 15)
) +
labs(x = NULL,
y = "# of Ph.D.",
color = "broad field",
title = "Total # of Ph.D.")
p2 / p1
Analyzing The Raw Data
The following EXCEL files are downloaded from NSF, and they are Table 12, 13 and 14.
Doctorate recipients, by major field of study: Selected years, 1987–2017
major_field <- read_xlsx("sed17-sr-tab012.xlsx", skip = 3) %>%
rename(field = 1)
major_field_processed <- major_field %>%
pivot_longer(-field) %>%
mutate(name = ifelse(str_detect(name, "\\.\\.\\."), NA, name)) %>%
fill(name) %>%
drop_na() %>%
rename(year = name,
num_of_phd = value) %>%
mutate(year = as.numeric(year),
num_of_phd = as.numeric(num_of_phd)) %>%
filter(num_of_phd > 100) %>%
mutate(field = ifelse(str_detect(field, "Other"), "Other", field),
field = str_to_title(field))
major_field_processed %>%
filter(field != "All Fields") %>%
complete(field, year, fill = list(0)) %>%
ggplot(aes(year, field, fill = num_of_phd)) +
geom_tile() +
scale_fill_gradient2(low = "blue",
high = "red",
mid = "white",
midpoint = mean(major_field_processed$num_of_phd)) +
scale_x_continuous(expand = c(0,0),
breaks = seq(1987, 2017, 2)) +
theme(
plot.title = element_text(size = 18),
axis.text = element_text(size = 13)
) +
labs(x = NULL,
y = NULL,
fill = "# of PhDs",
title = "# of PhDs Produced by Various Fields")
Doctorate recipients, by fine field of study: 2008–17
field <- read_xlsx("sed17-sr-tab013.xlsx", skip = 3) %>%
rename(field = 1)
field_processed <- field %>%
pivot_longer(-field, names_to = "year", values_to = "num_of_phds") %>%
mutate(year = as.numeric(year),
num_of_phds = as.numeric(num_of_phds))
top_7_fields <- field_processed %>%
filter(field != "All fields") %>%
group_by(field) %>%
summarize(total_num = sum(num_of_phds)) %>%
arrange(desc(total_num)) %>%
select(field) %>%
head(7) %>%
pull()
field_processed %>%
filter(field %in% top_7_fields) %>%
mutate(field = fct_reorder(field, -num_of_phds, sum)) %>%
ggplot(aes(year, num_of_phds, color = field)) +
geom_line(size = 1) +
scale_x_continuous(breaks = seq(2008, 2020, by = 2)) +
labs(x = NULL,
y = "# of PhDs",
title = "# of PhDs from Top 7 Majors",
subtitle = "Top Majors are computed based on total # of PhDs")
Doctorate recipients, by broad field of study and sex: Selected years, 1987–2017
field_gender <- read_xlsx("sed17-sr-tab014.xlsx", skip = 3) %>%
rename(field = 1)
## New names:
## * `` -> ...3
## * `` -> ...5
## * `` -> ...7
## * `` -> ...9
## * `` -> ...11
## * ...
field_gender_processed <- field_gender %>%
pivot_longer(-field, names_to = "year", values_to = "num_of_phds") %>%
mutate(year = ifelse(str_detect(year, "\\.\\.\\."), NA, year)) %>%
fill(year) %>%
drop_na() %>%
mutate(year = as.numeric(year),
num_of_phds = as.numeric(num_of_phds)) %>%
filter(num_of_phds > 100)
fields_filtered <-field_gender_processed %>%
filter(!field %in% c("Male", "Female")) %>%
select(field) %>%
distinct() %>%
pull()
field_gender_processed %>%
mutate(field_of_study = ifelse(field %in% fields_filtered, field, NA)) %>%
fill(field_of_study) %>%
rename(gender = field) %>%
mutate(gender = ifelse(gender == field_of_study, "Total", gender)) %>%
mutate(field_of_study = fct_recode(field_of_study,
"All field" = "All fieldsa",
"Life sciences" = "Life sciencesb",
"Other" = "Otherc")) %>%
filter(gender != "Total",
field_of_study != "All field") %>%
mutate(field_of_study = fct_reorder(field_of_study, -num_of_phds, sum),
field_of_study = str_to_title(field_of_study),
field_of_study = str_replace(field_of_study, "And", "&")) %>%
ggplot(aes(year, num_of_phds, color = field_of_study)) +
geom_point() +
geom_line(size = 1) +
facet_wrap(~gender, ncol = 1) +
theme(
strip.text = element_text(size = 15, face = "bold"),
plot.title = element_text(size = 18),
axis.title = element_text(size = 15),
axis.text = element_text(size = 11)
) +
scale_x_continuous(breaks = seq(1987, 2015)) +
labs(x = NULL,
y = "# of PhDs",
color = "field of study",
title = "# of PhDs from Various Fields of Study among Men & Women")