Salary Survey Data Visualization

Sun, Apr 24, 2022 4-minute read

This blog post analyzes salary survey from a wide range of perspectives, including race, gender, etc. As usual, the data comes from TidyTuesday.

library(tidyverse)
library(lubridate)
library(scales)
library(broom)
library(geofacet)
theme_set(theme_bw())
survey <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv') %>%
  filter(!is.na(state),
         !is.na(highest_level_of_education_completed),
         !is.na(race),
         currency == "USD",
         annual_salary > 2000,
         fct_lump(state, n = 51) != "Other") %>%
  mutate(timestamp = mdy_hms(timestamp),
         highest_level_of_education_completed = str_to_title(str_remove_all(highest_level_of_education_completed, "\\s[d|D]egree.*$")),
         race = str_remove_all(race, ",?\\s.*$"),
         race = fct_lump(race, n = 4)) %>%
  rename(age = how_old_are_you,
         education_level = highest_level_of_education_completed) %>% 
  mutate(education_level = factor(education_level, 
                                  levels = c("High School",
                                             "Some College",
                                             "College",
                                             "Master's",
                                             "Phd",
                                             "Professional")),
         education_level = fct_recode(education_level, 
                                      "PhD" = "Phd"))

survey 
## # A tibble: 21,148 x 18
##    timestamp           age   industry   job_title additional_cont~ annual_salary
##    <dttm>              <chr> <chr>      <chr>     <chr>                    <dbl>
##  1 2021-04-27 11:02:10 25-34 Education~ Research~ <NA>                     55000
##  2 2021-04-27 11:02:38 25-34 Accountin~ Marketin~ <NA>                     34000
##  3 2021-04-27 11:02:41 25-34 Nonprofits Program ~ <NA>                     62000
##  4 2021-04-27 11:02:42 25-34 Accountin~ Accounti~ <NA>                     60000
##  5 2021-04-27 11:02:46 25-34 Education~ Scholarl~ <NA>                     62000
##  6 2021-04-27 11:02:51 25-34 Publishing Publishi~ <NA>                     33000
##  7 2021-04-27 11:03:00 25-34 Education~ Librarian High school, FT          50000
##  8 2021-04-27 11:03:01 45-54 Computing~ Systems ~ Data developer/~        112000
##  9 2021-04-27 11:03:02 35-44 Accountin~ Senior A~ <NA>                     45000
## 10 2021-04-27 11:03:07 35-44 Education~ Deputy T~ <NA>                     62000
## # ... with 21,138 more rows, and 12 more variables: other_monetary_comp <dbl>,
## #   currency <chr>, currency_other <chr>, additional_context_on_income <chr>,
## #   country <chr>, state <chr>, city <chr>,
## #   overall_years_of_professional_experience <chr>,
## #   years_of_experience_in_field <chr>, education_level <fct>, gender <chr>,
## #   race <fct>

State-wise salary based on education level:

survey %>%
  ggplot(aes(education_level, annual_salary, fill = education_level, color = education_level)) +
  geom_boxplot(alpha = 0.5) +
  facet_geo(~state) +
  scale_y_log10(labels = dollar) +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  labs(x = NULL,
       y = "annual salary",
       fill = "",
       color = "")

Does degree help make more money?

survey %>%
  filter(age != "under 18") %>%
  group_by(age, education_level) %>%
  summarize(avg_salary = mean(annual_salary),
            n = n()) %>%
  ungroup() %>%
  filter(n >= 3) %>%
  ggplot(aes(age, avg_salary, fill = education_level)) +
  geom_col(show.legend = F) +
  facet_wrap(~education_level) +
  scale_y_continuous(labels = dollar) +
  theme(panel.grid = element_blank(),
        strip.text = element_text(size = 13),
        plot.title = element_text(size = 15)) +
  labs(y = "average annual salary",
       title = "Average Annual Salary per Age Group per Education Level")

Linear model for predicting annual salary:

survey %>%
  filter(age != "under 18") %>%
  select(age, annual_salary, education_level, gender, race) %>%
  summarize(model = list(lm(annual_salary ~ ., data = .))) %>%
  mutate(model = map(model, tidy, conf.int = T)) %>%
  unnest(model) %>%
  filter(term != "(Intercept)",
         !str_detect(term, "Prefer not to answer")) %>%
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(x = estimate,
             y = term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low,
                     xmax = conf.high),
                 height = 0.2) +
  geom_vline(xintercept = 0, size = 1.2, lty = 2, color = "red") +
  labs(y = NULL,
       title = "Linear Regression Term Estimate and 95% CI on Annual Salary")

T-test on race annual salary:

survey %>%
  filter(age != "under 18") %>%
  select(age, annual_salary, education_level, gender, race) %>%
  nest(data = -race) %>%
  mutate(model = map(data, ~t.test(.$annual_salary)),
         tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(race = fct_reorder(race, estimate)) %>%
  ggplot(aes(estimate, race)) +
  geom_point() +
  geom_errorbarh(
    aes(xmin = conf.low,
        xmax = conf.high),
    height = 0.2
    ) +
  scale_x_continuous(labels = dollar) +
  labs(y = NULL,
       title = "One-sample T-Test on Races Impacting Annual Salary")

Men and women’s annual salary:

survey %>%
  filter(age != "under 18") %>%
  select(age, annual_salary, education_level, gender, race) %>%
  filter(gender %in% c("Man", "Woman")) %>%
  nest(data = -gender) %>%
  mutate(model = map(data, ~t.test(.$annual_salary)),
         tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  ggplot(aes(gender, estimate, fill = gender)) +
  geom_col(alpha = 0.5) +
  geom_errorbar(aes(ymin = conf.low,
                    ymax = conf.high),
                width = 0.5) +
  theme(panel.grid = element_blank(),
        legend.position = "none") +
  labs(x = NULL,
       y = "estimated annual salary",
       title = "Men and Women T-test Estimated Annual Salary") +
  scale_y_continuous(labels = dollar)  

Where does Ph.D. go?

survey %>%
  filter(education_level == "PhD",
         gender %in% c("Man", "Woman")) %>%
  group_by(industry, gender) %>%
  summarize(avg_salary = mean(annual_salary),
            n = n()) %>%
  ungroup() %>%
  filter(n > 5) %>%
  mutate(industry = paste0(industry, "(", n, ")"),
         industry = fct_reorder(industry, avg_salary)) %>%
  ggplot(aes(avg_salary, industry)) +
  geom_col() +
  scale_x_continuous(labels = dollar) +
  labs(x = "average annual salary",
       title = "Where does Ph.D. go?") +
  facet_wrap(~gender, scales = "free_y") +
  theme(strip.text = element_text(size = 13),
        plot.title = element_text(size = 15))